Data exploration of survey dataset

We begin by loading the required libraries and the dataset

flight_exploration <- read.csv("Train.csv", header = TRUE)
str(flight_exploration)
## 'data.frame':    103904 obs. of  25 variables:
##  $ X                                : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ id                               : int  70172 5047 110028 24026 119299 111157 82113 96462 79485 65725 ...
##  $ Gender                           : chr  "Male" "Male" "Female" "Female" ...
##  $ Customer.Type                    : chr  "Loyal Customer" "disloyal Customer" "Loyal Customer" "Loyal Customer" ...
##  $ Age                              : int  13 25 26 25 61 26 47 52 41 20 ...
##  $ Type.of.Travel                   : chr  "Personal Travel" "Business travel" "Business travel" "Business travel" ...
##  $ Class                            : chr  "Eco Plus" "Business" "Business" "Business" ...
##  $ Flight.Distance                  : int  460 235 1142 562 214 1180 1276 2035 853 1061 ...
##  $ Inflight.wifi.service            : int  3 3 2 2 3 3 2 4 1 3 ...
##  $ Departure.Arrival.time.convenient: int  4 2 2 5 3 4 4 3 2 3 ...
##  $ Ease.of.Online.booking           : int  3 3 2 5 3 2 2 4 2 3 ...
##  $ Gate.location                    : int  1 3 2 5 3 1 3 4 2 4 ...
##  $ Food.and.drink                   : int  5 1 5 2 4 1 2 5 4 2 ...
##  $ Online.boarding                  : int  3 3 5 2 5 2 2 5 3 3 ...
##  $ Seat.comfort                     : int  5 1 5 2 5 1 2 5 3 3 ...
##  $ Inflight.entertainment           : int  5 1 5 2 3 1 2 5 1 2 ...
##  $ On.board.service                 : int  4 1 4 2 3 3 3 5 1 2 ...
##  $ Leg.room.service                 : int  3 5 3 5 4 4 3 5 2 3 ...
##  $ Baggage.handling                 : int  4 3 4 3 4 4 4 5 1 4 ...
##  $ Checkin.service                  : int  4 1 4 1 3 4 3 4 4 4 ...
##  $ Inflight.service                 : int  5 4 4 4 3 4 5 5 1 3 ...
##  $ Cleanliness                      : int  5 1 5 2 3 1 2 4 2 2 ...
##  $ Departure.Delay.in.Minutes       : int  25 1 0 11 0 0 9 4 0 0 ...
##  $ Arrival.Delay.in.Minutes         : num  18 6 0 9 0 0 23 0 0 0 ...
##  $ satisfaction                     : chr  "neutral or dissatisfied" "neutral or dissatisfied" "satisfied" "neutral or dissatisfied" ...

Distribution of scores received on services

The services are grouped in three categories- inflight serives, airport services, and online services.

#Classifying the experience into Airport,Inflight and online
Group <- rename(flight_exploration, Inflight1 = 'Seat.comfort', 
              Airport1 = 'Departure.Arrival.time.convenient',
              Inflight2 ='Food.and.drink', 
              Airport2 = 'Gate.location',
              Inflight3 = 'Inflight.wifi.service',
              Inflight4 = 'Inflight.entertainment',
              Online1 = 'Ease.of.Online.booking',
              Inflight5 = 'On.board.service',
              Inflight6 = 'Leg.room.service',
              Airport3 = 'Baggage.handling',
              Airport4 = 'Checkin.service',
              Inflight7 = Cleanliness,
              Online3 = 'Online.boarding')

#Output of the grouping
Group <- Group %>% 
  mutate(InflightExperience =rowMeans(Group %>% select(starts_with("Inflight"))),
         AirportExperience=rowMeans(Group %>% select(starts_with("Airport"))),
         OnlineExperience=rowMeans(Group %>% select(starts_with("Online"))),
         male=ifelse(Gender == 1,0,1))

glimpse(Group)
## Rows: 103,904
## Columns: 29
## $ X                          <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1…
## $ id                         <int> 70172, 5047, 110028, 24026, 119299, 111157,…
## $ Gender                     <chr> "Male", "Male", "Female", "Female", "Male",…
## $ Customer.Type              <chr> "Loyal Customer", "disloyal Customer", "Loy…
## $ Age                        <int> 13, 25, 26, 25, 61, 26, 47, 52, 41, 20, 24,…
## $ Type.of.Travel             <chr> "Personal Travel", "Business travel", "Busi…
## $ Class                      <chr> "Eco Plus", "Business", "Business", "Busine…
## $ Flight.Distance            <int> 460, 235, 1142, 562, 214, 1180, 1276, 2035,…
## $ Inflight3                  <int> 3, 3, 2, 2, 3, 3, 2, 4, 1, 3, 4, 2, 1, 4, 3…
## $ Airport1                   <int> 4, 2, 2, 5, 3, 4, 4, 3, 2, 3, 5, 4, 4, 2, 2…
## $ Online1                    <int> 3, 3, 2, 5, 3, 2, 2, 4, 2, 3, 5, 2, 4, 4, 3…
## $ Airport2                   <int> 1, 3, 2, 5, 3, 1, 3, 4, 2, 4, 4, 2, 4, 3, 2…
## $ Inflight2                  <int> 5, 1, 5, 2, 4, 1, 2, 5, 4, 2, 2, 1, 1, 4, 2…
## $ Online3                    <int> 3, 3, 5, 2, 5, 2, 2, 5, 3, 3, 5, 2, 1, 4, 3…
## $ Inflight1                  <int> 5, 1, 5, 2, 5, 1, 2, 5, 3, 3, 2, 1, 1, 4, 2…
## $ Inflight4                  <int> 5, 1, 5, 2, 3, 1, 2, 5, 1, 2, 2, 1, 1, 4, 2…
## $ Inflight5                  <int> 4, 1, 4, 2, 3, 3, 3, 5, 1, 2, 3, 1, 1, 4, 4…
## $ Inflight6                  <int> 3, 5, 3, 5, 4, 4, 3, 5, 2, 3, 3, 2, 1, 5, 3…
## $ Airport3                   <int> 4, 3, 4, 3, 4, 4, 4, 5, 1, 4, 5, 5, 3, 2, 2…
## $ Airport4                   <int> 4, 1, 4, 1, 3, 4, 3, 4, 4, 4, 3, 5, 4, 2, 2…
## $ Inflight.service           <int> 5, 4, 4, 4, 3, 4, 5, 5, 1, 3, 5, 5, 4, 2, 1…
## $ Inflight7                  <int> 5, 1, 5, 2, 3, 1, 2, 4, 2, 2, 2, 1, 1, 4, 2…
## $ Departure.Delay.in.Minutes <int> 25, 1, 0, 11, 0, 0, 9, 4, 0, 0, 0, 0, 28, 0…
## $ Arrival.Delay.in.Minutes   <dbl> 18, 6, 0, 9, 0, 0, 23, 0, 0, 0, 0, 0, 8, 0,…
## $ satisfaction               <chr> "neutral or dissatisfied", "neutral or diss…
## $ InflightExperience         <dbl> 4.375, 2.125, 4.125, 2.625, 3.500, 2.250, 2…
## $ AirportExperience          <dbl> 3.25, 2.25, 3.00, 3.50, 3.25, 3.25, 3.50, 4…
## $ OnlineExperience           <dbl> 3.0, 3.0, 3.5, 3.5, 4.0, 2.0, 2.0, 4.5, 2.5…
## $ male                       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
# Plot of inflight Experience
ggplot(Group) +
  aes(x = InflightExperience) +
  geom_histogram(aes(y = ..density..)) +
  geom_density()+
  labs(title = "Histogram of Inflight Experience Satisfaction Scores")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Plot of Airport experience
ggplot(Group) +
  aes(x = AirportExperience) +
  geom_histogram(aes(y = ..density..)) +
  geom_density()+
  labs(title = "Histogram of Airport Experience Satisfaction Scores")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Plot of Online experience
ggplot(Group) +
  aes(x = OnlineExperience) +
  geom_histogram(aes(y = ..density..)) +
  geom_density()+
  labs(title = "Histogram of Online Experience Satisfaction Scores")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Outliers in scores

Gathering and renaming for better visualization

sat <- gather(flight_exploration, 'Seat.comfort', 'Departure.Arrival.time.convenient', 'Food.and.drink', 'Gate.location', 'Inflight.wifi.service', 'Inflight.entertainment', 'Ease.of.Online.booking', 'On.board.service', 'Leg.room.service','Baggage.handling', 'Checkin.service', Cleanliness, 'Online.boarding', key = "Criteria", value = "Satisfaction_Scale")

sat$Criteria[sat$Criteria=="Seat.comfort"] <- "A1"
sat$Criteria[sat$Criteria=="Departure.Arrival.time.convenient"] <- "A2"
sat$Criteria[sat$Criteria=="Food.and.drink"] <- "A3"
sat$Criteria[sat$Criteria=="Gate.location"] <- "A4"
sat$Criteria[sat$Criteria=="Inflight.wifi.service"] <- "A5"
sat$Criteria[sat$Criteria=="Inflight.entertainment"] <- "A6"
sat$Criteria[sat$Criteria=="Ease.of.Online.booking"] <- "A7"
sat$Criteria[sat$Criteria=="On.board.service"] <- "A8"
sat$Criteria[sat$Criteria=="Leg.room.service"] <- "A9"
sat$Criteria[sat$Criteria=="Baggage.handling"] <- "A10"
sat$Criteria[sat$Criteria=="Checkin.service"] <- "A11"
sat$Criteria[sat$Criteria=="Cleanliness"] <- "A12"
sat$Criteria[sat$Criteria=="Online.boarding"] <- "A13"

sat$Criteria <- as.factor(sat$Criteria)
glimpse(sat)
## Rows: 1,350,752
## Columns: 14
## $ X                          <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1…
## $ id                         <int> 70172, 5047, 110028, 24026, 119299, 111157,…
## $ Gender                     <chr> "Male", "Male", "Female", "Female", "Male",…
## $ Customer.Type              <chr> "Loyal Customer", "disloyal Customer", "Loy…
## $ Age                        <int> 13, 25, 26, 25, 61, 26, 47, 52, 41, 20, 24,…
## $ Type.of.Travel             <chr> "Personal Travel", "Business travel", "Busi…
## $ Class                      <chr> "Eco Plus", "Business", "Business", "Busine…
## $ Flight.Distance            <int> 460, 235, 1142, 562, 214, 1180, 1276, 2035,…
## $ Inflight.service           <int> 5, 4, 4, 4, 3, 4, 5, 5, 1, 3, 5, 5, 4, 2, 1…
## $ Departure.Delay.in.Minutes <int> 25, 1, 0, 11, 0, 0, 9, 4, 0, 0, 0, 0, 28, 0…
## $ Arrival.Delay.in.Minutes   <dbl> 18, 6, 0, 9, 0, 0, 23, 0, 0, 0, 0, 0, 8, 0,…
## $ satisfaction               <chr> "neutral or dissatisfied", "neutral or diss…
## $ Criteria                   <fct> A1, A1, A1, A1, A1, A1, A1, A1, A1, A1, A1,…
## $ Satisfaction_Scale         <int> 5, 1, 5, 2, 5, 1, 2, 5, 3, 3, 2, 1, 1, 4, 2…

Now, a box plot is plotted to check the services which had lower scores than the average scores

sat %>% 
  mutate(class = fct_reorder(Criteria, Satisfaction_Scale, .fun = 'mean')) %>% 
  ggplot(aes(x=reorder(Criteria, Satisfaction_Scale), y = Satisfaction_Scale,  fill = Criteria)) + 
  geom_boxplot()+
  stat_summary(fun.y = "mean", geom = "point", shape = 10, size = 3,fill = "Yellow") +
  geom_hline(aes(yintercept = mean(Satisfaction_Scale)), linetype="dashed",color = "Orange", size = 1.2)+
  scale_fill_brewer() +
  labs(title = "Boxplot illustrating the Mean of Satisfaction Level",
       caption = "BoxPlot",
       x = "Satisfaction Criteria",
       y = "Satisfaction Scale")+
  theme(legend.position = "none")
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors

Scores of services by satisfied and disatified passengers

# inflight wifi 
ggplot(flight_exploration, 
      aes(x = satisfaction, 
            y = Inflight.wifi.service)) +
  geom_boxplot()

# departure arrival time convenient 
ggplot(flight_exploration, 
      aes(x = satisfaction, 
            y = Departure.Arrival.time.convenient)) +
  geom_boxplot()

# ease of online booking 
ggplot(flight_exploration, 
      aes(x = satisfaction, 
            y = Ease.of.Online.booking)) +
  geom_boxplot()

# gate location 
ggplot(flight_exploration, 
      aes(x = satisfaction, 
            y = Gate.location)) +
  geom_boxplot()

# food and drink 
ggplot(flight_exploration, 
      aes(x = satisfaction, 
            y = Food.and.drink)) +
  geom_boxplot()

# online boarding
ggplot(flight_exploration, 
      aes(x = satisfaction, 
            y = Online.boarding)) +
  geom_boxplot()

# seat comfort
ggplot(flight_exploration, 
      aes(x = satisfaction, 
            y = Seat.comfort)) +
  geom_boxplot()

# inflight entertainment
ggplot(flight_exploration, 
      aes(x = satisfaction, 
            y = Inflight.entertainment)) +
  geom_boxplot()

# on board services 
ggplot(flight_exploration, 
      aes(x = satisfaction, 
            y = On.board.service)) +
  geom_boxplot()

# leg room 
ggplot(flight_exploration, 
      aes(x = satisfaction, 
            y = Leg.room.service)) +
  geom_boxplot()

# baggage handling
ggplot(flight_exploration, 
      aes(x = satisfaction, 
            y = Baggage.handling)) +
  geom_boxplot()

# checkin service 
ggplot(flight_exploration, 
      aes(x = satisfaction, 
            y = Checkin.service)) +
  geom_boxplot()

# inflight service 
ggplot(flight_exploration, 
      aes(x = satisfaction, 
            y = Inflight.service)) +
  geom_boxplot()

# cleanliness 
ggplot(flight_exploration, 
      aes(x = satisfaction, 
            y = Cleanliness)) +
  geom_boxplot()

Classification using survey dataset

We load the necessary libraries and store the flight survey dataset as dataframe

flight_survey <- flight_exploration
str(flight_survey)
## 'data.frame':    103904 obs. of  25 variables:
##  $ X                                : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ id                               : int  70172 5047 110028 24026 119299 111157 82113 96462 79485 65725 ...
##  $ Gender                           : chr  "Male" "Male" "Female" "Female" ...
##  $ Customer.Type                    : chr  "Loyal Customer" "disloyal Customer" "Loyal Customer" "Loyal Customer" ...
##  $ Age                              : int  13 25 26 25 61 26 47 52 41 20 ...
##  $ Type.of.Travel                   : chr  "Personal Travel" "Business travel" "Business travel" "Business travel" ...
##  $ Class                            : chr  "Eco Plus" "Business" "Business" "Business" ...
##  $ Flight.Distance                  : int  460 235 1142 562 214 1180 1276 2035 853 1061 ...
##  $ Inflight.wifi.service            : int  3 3 2 2 3 3 2 4 1 3 ...
##  $ Departure.Arrival.time.convenient: int  4 2 2 5 3 4 4 3 2 3 ...
##  $ Ease.of.Online.booking           : int  3 3 2 5 3 2 2 4 2 3 ...
##  $ Gate.location                    : int  1 3 2 5 3 1 3 4 2 4 ...
##  $ Food.and.drink                   : int  5 1 5 2 4 1 2 5 4 2 ...
##  $ Online.boarding                  : int  3 3 5 2 5 2 2 5 3 3 ...
##  $ Seat.comfort                     : int  5 1 5 2 5 1 2 5 3 3 ...
##  $ Inflight.entertainment           : int  5 1 5 2 3 1 2 5 1 2 ...
##  $ On.board.service                 : int  4 1 4 2 3 3 3 5 1 2 ...
##  $ Leg.room.service                 : int  3 5 3 5 4 4 3 5 2 3 ...
##  $ Baggage.handling                 : int  4 3 4 3 4 4 4 5 1 4 ...
##  $ Checkin.service                  : int  4 1 4 1 3 4 3 4 4 4 ...
##  $ Inflight.service                 : int  5 4 4 4 3 4 5 5 1 3 ...
##  $ Cleanliness                      : int  5 1 5 2 3 1 2 4 2 2 ...
##  $ Departure.Delay.in.Minutes       : int  25 1 0 11 0 0 9 4 0 0 ...
##  $ Arrival.Delay.in.Minutes         : num  18 6 0 9 0 0 23 0 0 0 ...
##  $ satisfaction                     : chr  "neutral or dissatisfied" "neutral or dissatisfied" "satisfied" "neutral or dissatisfied" ...
summary(flight_survey)
##        X                id            Gender          Customer.Type     
##  Min.   :     0   Min.   :     1   Length:103904      Length:103904     
##  1st Qu.: 25976   1st Qu.: 32534   Class :character   Class :character  
##  Median : 51952   Median : 64856   Mode  :character   Mode  :character  
##  Mean   : 51952   Mean   : 64924                                        
##  3rd Qu.: 77927   3rd Qu.: 97368                                        
##  Max.   :103903   Max.   :129880                                        
##                                                                         
##       Age        Type.of.Travel        Class           Flight.Distance
##  Min.   : 7.00   Length:103904      Length:103904      Min.   :  31   
##  1st Qu.:27.00   Class :character   Class :character   1st Qu.: 414   
##  Median :40.00   Mode  :character   Mode  :character   Median : 843   
##  Mean   :39.38                                         Mean   :1189   
##  3rd Qu.:51.00                                         3rd Qu.:1743   
##  Max.   :85.00                                         Max.   :4983   
##                                                                       
##  Inflight.wifi.service Departure.Arrival.time.convenient Ease.of.Online.booking
##  Min.   :0.00          Min.   :0.00                      Min.   :0.000         
##  1st Qu.:2.00          1st Qu.:2.00                      1st Qu.:2.000         
##  Median :3.00          Median :3.00                      Median :3.000         
##  Mean   :2.73          Mean   :3.06                      Mean   :2.757         
##  3rd Qu.:4.00          3rd Qu.:4.00                      3rd Qu.:4.000         
##  Max.   :5.00          Max.   :5.00                      Max.   :5.000         
##                                                                                
##  Gate.location   Food.and.drink  Online.boarding  Seat.comfort  
##  Min.   :0.000   Min.   :0.000   Min.   :0.00    Min.   :0.000  
##  1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.00    1st Qu.:2.000  
##  Median :3.000   Median :3.000   Median :3.00    Median :4.000  
##  Mean   :2.977   Mean   :3.202   Mean   :3.25    Mean   :3.439  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.00    3rd Qu.:5.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.00    Max.   :5.000  
##                                                                 
##  Inflight.entertainment On.board.service Leg.room.service Baggage.handling
##  Min.   :0.000          Min.   :0.000    Min.   :0.000    Min.   :1.000   
##  1st Qu.:2.000          1st Qu.:2.000    1st Qu.:2.000    1st Qu.:3.000   
##  Median :4.000          Median :4.000    Median :4.000    Median :4.000   
##  Mean   :3.358          Mean   :3.382    Mean   :3.351    Mean   :3.632   
##  3rd Qu.:4.000          3rd Qu.:4.000    3rd Qu.:4.000    3rd Qu.:5.000   
##  Max.   :5.000          Max.   :5.000    Max.   :5.000    Max.   :5.000   
##                                                                           
##  Checkin.service Inflight.service  Cleanliness    Departure.Delay.in.Minutes
##  Min.   :0.000   Min.   :0.00     Min.   :0.000   Min.   :   0.00           
##  1st Qu.:3.000   1st Qu.:3.00     1st Qu.:2.000   1st Qu.:   0.00           
##  Median :3.000   Median :4.00     Median :3.000   Median :   0.00           
##  Mean   :3.304   Mean   :3.64     Mean   :3.286   Mean   :  14.82           
##  3rd Qu.:4.000   3rd Qu.:5.00     3rd Qu.:4.000   3rd Qu.:  12.00           
##  Max.   :5.000   Max.   :5.00     Max.   :5.000   Max.   :1592.00           
##                                                                             
##  Arrival.Delay.in.Minutes satisfaction      
##  Min.   :   0.00          Length:103904     
##  1st Qu.:   0.00          Class :character  
##  Median :   0.00          Mode  :character  
##  Mean   :  15.18                            
##  3rd Qu.:  13.00                            
##  Max.   :1584.00                            
##  NA's   :310

Data preprocessing

Here we remove the id columns, change the type of the variables, and removing the outliers

# removing X and id 
flight_survey <- flight_survey[, -c(1, 2)]

# changing the characters to factors
flight_survey$Gender <- as.factor(flight_survey$Gender)
flight_survey$Customer.Type <-as.factor(flight_survey$Customer.Type)
flight_survey$Type.of.Travel<-as.factor(flight_survey$Type.of.Travel)
flight_survey$Class<-as.factor(flight_survey$Class)
flight_survey$satisfaction<-as.factor(flight_survey$satisfaction)

# changing the labels 
levels(flight_survey$satisfaction) <- c("dissatisfied", "satisfied")

# removing outliers 
iqr <- IQR(flight_survey$Departure.Delay.in.Minutes) #removing outliers in departure delays
Q <- quantile(flight_survey$Departure.Delay.in.Minutes, probs=c(.25, .75), na.rm = FALSE)
flight_survey<- subset(flight_survey, flight_survey$Departure.Delay.in.Minutes > (Q[1] - 2.5*iqr) & flight_survey$Departure.Delay.in.Minutes < (Q[2]+2.5*iqr))
iqr2 <- IQR(flight_survey$Arrival.Delay.in.Minutes, na.rm = TRUE) #removing outliers in arrival delays
Q1 <- quantile(flight_survey$Arrival.Delay.in.Minutes, probs=c(.25, .75), na.rm = TRUE)
flight_survey<- subset(flight_survey, flight_survey$Arrival.Delay.in.Minutes > (Q1[1] - 2.5*iqr2) & flight_survey$Arrival.Delay.in.Minutes < (Q1[2]+2.5*iqr2))

str(flight_survey)
## 'data.frame':    83620 obs. of  23 variables:
##  $ Gender                           : Factor w/ 2 levels "Female","Male": 2 2 1 1 2 1 1 1 2 1 ...
##  $ Customer.Type                    : Factor w/ 2 levels "disloyal Customer",..: 2 1 2 2 2 2 2 2 1 1 ...
##  $ Age                              : int  13 25 26 25 61 26 52 41 20 24 ...
##  $ Type.of.Travel                   : Factor w/ 2 levels "Business travel",..: 2 1 1 1 1 2 1 1 1 1 ...
##  $ Class                            : Factor w/ 3 levels "Business","Eco",..: 3 1 1 1 1 2 1 1 2 2 ...
##  $ Flight.Distance                  : int  460 235 1142 562 214 1180 2035 853 1061 1182 ...
##  $ Inflight.wifi.service            : int  3 3 2 2 3 3 4 1 3 4 ...
##  $ Departure.Arrival.time.convenient: int  4 2 2 5 3 4 3 2 3 5 ...
##  $ Ease.of.Online.booking           : int  3 3 2 5 3 2 4 2 3 5 ...
##  $ Gate.location                    : int  1 3 2 5 3 1 4 2 4 4 ...
##  $ Food.and.drink                   : int  5 1 5 2 4 1 5 4 2 2 ...
##  $ Online.boarding                  : int  3 3 5 2 5 2 5 3 3 5 ...
##  $ Seat.comfort                     : int  5 1 5 2 5 1 5 3 3 2 ...
##  $ Inflight.entertainment           : int  5 1 5 2 3 1 5 1 2 2 ...
##  $ On.board.service                 : int  4 1 4 2 3 3 5 1 2 3 ...
##  $ Leg.room.service                 : int  3 5 3 5 4 4 5 2 3 3 ...
##  $ Baggage.handling                 : int  4 3 4 3 4 4 5 1 4 5 ...
##  $ Checkin.service                  : int  4 1 4 1 3 4 4 4 4 3 ...
##  $ Inflight.service                 : int  5 4 4 4 3 4 5 1 3 5 ...
##  $ Cleanliness                      : int  5 1 5 2 3 1 4 2 2 2 ...
##  $ Departure.Delay.in.Minutes       : int  25 1 0 11 0 0 4 0 0 0 ...
##  $ Arrival.Delay.in.Minutes         : num  18 6 0 9 0 0 0 0 0 0 ...
##  $ satisfaction                     : Factor w/ 2 levels "dissatisfied",..: 1 1 2 1 2 1 2 1 1 1 ...
# removing the missing value
colSums(is.na(flight_survey)) 
##                            Gender                     Customer.Type 
##                                 0                                 0 
##                               Age                    Type.of.Travel 
##                                 0                                 0 
##                             Class                   Flight.Distance 
##                                 0                                 0 
##             Inflight.wifi.service Departure.Arrival.time.convenient 
##                                 0                                 0 
##            Ease.of.Online.booking                     Gate.location 
##                                 0                                 0 
##                    Food.and.drink                   Online.boarding 
##                                 0                                 0 
##                      Seat.comfort            Inflight.entertainment 
##                                 0                                 0 
##                  On.board.service                  Leg.room.service 
##                                 0                                 0 
##                  Baggage.handling                   Checkin.service 
##                                 0                                 0 
##                  Inflight.service                       Cleanliness 
##                                 0                                 0 
##        Departure.Delay.in.Minutes          Arrival.Delay.in.Minutes 
##                                 0                                 0 
##                      satisfaction 
##                                 0
flight_survey<- na.omit(flight_survey)

Logistic Regression

Training the model
mdl1 <- glm(satisfaction~.-Departure.Delay.in.Minutes , data=flight_survey, family="binomial")
options(scipen=999)
summary(mdl1)
## 
## Call:
## glm(formula = satisfaction ~ . - Departure.Delay.in.Minutes, 
##     family = "binomial", data = flight_survey)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7617  -0.5037  -0.1701   0.3960   3.9350  
## 
## Coefficients:
##                                      Estimate  Std. Error z value
## (Intercept)                       -7.48464535  0.08709011 -85.941
## GenderMale                         0.04862715  0.02149084   2.263
## Customer.TypeLoyal Customer        2.09080861  0.03315771  63.056
## Age                               -0.00961006  0.00077830 -12.347
## Type.of.TravelPersonal Travel     -2.67385210  0.03417067 -78.250
## ClassEco                          -0.71976384  0.02846077 -25.290
## ClassEco Plus                     -0.77550378  0.04563738 -16.993
## Flight.Distance                   -0.00001733  0.00001264  -1.371
## Inflight.wifi.service              0.42483003  0.01288161  32.980
## Departure.Arrival.time.convenient -0.12386715  0.00885696 -13.985
## Ease.of.Online.booking            -0.18162891  0.01263668 -14.373
## Gate.location                      0.03870844  0.00995289   3.889
## Food.and.drink                    -0.05826911  0.01209818  -4.816
## Online.boarding                    0.59979578  0.01125019  53.314
## Seat.comfort                       0.08437149  0.01222310   6.903
## Inflight.entertainment             0.03514923  0.01638286   2.145
## On.board.service                   0.31410850  0.01137476  27.615
## Leg.room.service                   0.25728844  0.00933544  27.560
## Baggage.handling                   0.12367191  0.01255443   9.851
## Checkin.service                    0.30743826  0.00937486  32.794
## Inflight.service                   0.09777242  0.01339472   7.299
## Cleanliness                        0.24029470  0.01346982  17.839
## Arrival.Delay.in.Minutes          -0.05026857  0.00223788 -22.463
##                                               Pr(>|z|)    
## (Intercept)                       < 0.0000000000000002 ***
## GenderMale                                    0.023655 *  
## Customer.TypeLoyal Customer       < 0.0000000000000002 ***
## Age                               < 0.0000000000000002 ***
## Type.of.TravelPersonal Travel     < 0.0000000000000002 ***
## ClassEco                          < 0.0000000000000002 ***
## ClassEco Plus                     < 0.0000000000000002 ***
## Flight.Distance                               0.170325    
## Inflight.wifi.service             < 0.0000000000000002 ***
## Departure.Arrival.time.convenient < 0.0000000000000002 ***
## Ease.of.Online.booking            < 0.0000000000000002 ***
## Gate.location                                 0.000101 ***
## Food.and.drink                       0.000001462064512 ***
## Online.boarding                   < 0.0000000000000002 ***
## Seat.comfort                         0.000000000005105 ***
## Inflight.entertainment                        0.031914 *  
## On.board.service                  < 0.0000000000000002 ***
## Leg.room.service                  < 0.0000000000000002 ***
## Baggage.handling                  < 0.0000000000000002 ***
## Checkin.service                   < 0.0000000000000002 ***
## Inflight.service                     0.000000000000289 ***
## Cleanliness                       < 0.0000000000000002 ***
## Arrival.Delay.in.Minutes          < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 115147  on 83619  degrees of freedom
## Residual deviance:  56852  on 83597  degrees of freedom
## AIC: 56898
## 
## Number of Fisher Scoring iterations: 5

Using step regression to

library(caret)
#training control for cross validation
tr <- trainControl(method="cv", number=10)

#using feature selection 
mdl2 <- step(mdl1, direction="both", trainControl=tr)
## Start:  AIC=56898.29
## satisfaction ~ (Gender + Customer.Type + Age + Type.of.Travel + 
##     Class + Flight.Distance + Inflight.wifi.service + Departure.Arrival.time.convenient + 
##     Ease.of.Online.booking + Gate.location + Food.and.drink + 
##     Online.boarding + Seat.comfort + Inflight.entertainment + 
##     On.board.service + Leg.room.service + Baggage.handling + 
##     Checkin.service + Inflight.service + Cleanliness + Departure.Delay.in.Minutes + 
##     Arrival.Delay.in.Minutes) - Departure.Delay.in.Minutes
## 
##                                     Df Deviance   AIC
## - Flight.Distance                    1    56854 56898
## <none>                                    56852 56898
## - Inflight.entertainment             1    56857 56901
## - Gender                             1    56857 56901
## - Gate.location                      1    56867 56911
## - Food.and.drink                     1    56876 56920
## - Seat.comfort                       1    56900 56944
## - Inflight.service                   1    56906 56950
## - Baggage.handling                   1    56950 56994
## - Age                                1    57006 57050
## - Departure.Arrival.time.convenient  1    57047 57091
## - Ease.of.Online.booking             1    57061 57105
## - Cleanliness                        1    57172 57216
## - Arrival.Delay.in.Minutes           1    57371 57415
## - Class                              2    57552 57594
## - Leg.room.service                   1    57617 57661
## - On.board.service                   1    57631 57675
## - Checkin.service                    1    57961 58005
## - Inflight.wifi.service              1    57980 58024
## - Online.boarding                    1    59868 59912
## - Customer.Type                      1    61229 61273
## - Type.of.Travel                     1    63961 64005
## 
## Step:  AIC=56898.17
## satisfaction ~ Gender + Customer.Type + Age + Type.of.Travel + 
##     Class + Inflight.wifi.service + Departure.Arrival.time.convenient + 
##     Ease.of.Online.booking + Gate.location + Food.and.drink + 
##     Online.boarding + Seat.comfort + Inflight.entertainment + 
##     On.board.service + Leg.room.service + Baggage.handling + 
##     Checkin.service + Inflight.service + Cleanliness + Arrival.Delay.in.Minutes
## 
##                                     Df Deviance   AIC
## <none>                                    56854 56898
## + Flight.Distance                    1    56852 56898
## - Inflight.entertainment             1    56859 56901
## - Gender                             1    56859 56901
## - Gate.location                      1    56869 56911
## - Food.and.drink                     1    56877 56919
## - Seat.comfort                       1    56901 56943
## - Inflight.service                   1    56908 56950
## - Baggage.handling                   1    56952 56994
## - Age                                1    57006 57048
## - Departure.Arrival.time.convenient  1    57050 57092
## - Ease.of.Online.booking             1    57063 57105
## - Cleanliness                        1    57174 57216
## - Arrival.Delay.in.Minutes           1    57373 57415
## - Class                              2    57614 57654
## - Leg.room.service                   1    57617 57659
## - On.board.service                   1    57632 57674
## - Checkin.service                    1    57963 58005
## - Inflight.wifi.service              1    57990 58032
## - Online.boarding                    1    59868 59910
## - Customer.Type                      1    61505 61547
## - Type.of.Travel                     1    64053 64095
summary(mdl2)
## 
## Call:
## glm(formula = satisfaction ~ Gender + Customer.Type + Age + Type.of.Travel + 
##     Class + Inflight.wifi.service + Departure.Arrival.time.convenient + 
##     Ease.of.Online.booking + Gate.location + Food.and.drink + 
##     Online.boarding + Seat.comfort + Inflight.entertainment + 
##     On.board.service + Leg.room.service + Baggage.handling + 
##     Checkin.service + Inflight.service + Cleanliness + Arrival.Delay.in.Minutes, 
##     family = "binomial", data = flight_survey)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7671  -0.5036  -0.1704   0.3959   3.9239  
## 
## Coefficients:
##                                     Estimate Std. Error z value
## (Intercept)                       -7.5046728  0.0858980 -87.367
## GenderMale                         0.0486408  0.0214906   2.263
## Customer.TypeLoyal Customer        2.0788437  0.0319685  65.028
## Age                               -0.0095416  0.0007766 -12.286
## Type.of.TravelPersonal Travel     -2.6682199  0.0339176 -78.668
## ClassEco                          -0.7077195  0.0270699 -26.144
## ClassEco Plus                     -0.7610253  0.0443803 -17.148
## Inflight.wifi.service              0.4257968  0.0128655  33.096
## Departure.Arrival.time.convenient -0.1240473  0.0088546 -14.009
## Ease.of.Online.booking            -0.1819713  0.0126371 -14.400
## Gate.location                      0.0388148  0.0099518   3.900
## Food.and.drink                    -0.0580765  0.0120987  -4.800
## Online.boarding                    0.5993384  0.0112455  53.296
## Seat.comfort                       0.0841054  0.0122212   6.882
## Inflight.entertainment             0.0350499  0.0163826   2.139
## On.board.service                   0.3138995  0.0113705  27.606
## Leg.room.service                   0.2568505  0.0093286  27.534
## Baggage.handling                   0.1239279  0.0125491   9.875
## Checkin.service                    0.3073560  0.0093740  32.788
## Inflight.service                   0.0980817  0.0133884   7.326
## Cleanliness                        0.2402937  0.0134716  17.837
## Arrival.Delay.in.Minutes          -0.0502807  0.0022379 -22.467
##                                               Pr(>|z|)    
## (Intercept)                       < 0.0000000000000002 ***
## GenderMale                                      0.0236 *  
## Customer.TypeLoyal Customer       < 0.0000000000000002 ***
## Age                               < 0.0000000000000002 ***
## Type.of.TravelPersonal Travel     < 0.0000000000000002 ***
## ClassEco                          < 0.0000000000000002 ***
## ClassEco Plus                     < 0.0000000000000002 ***
## Inflight.wifi.service             < 0.0000000000000002 ***
## Departure.Arrival.time.convenient < 0.0000000000000002 ***
## Ease.of.Online.booking            < 0.0000000000000002 ***
## Gate.location                        0.000096082221676 ***
## Food.and.drink                       0.000001584776660 ***
## Online.boarding                   < 0.0000000000000002 ***
## Seat.comfort                         0.000000000005905 ***
## Inflight.entertainment                          0.0324 *  
## On.board.service                  < 0.0000000000000002 ***
## Leg.room.service                  < 0.0000000000000002 ***
## Baggage.handling                  < 0.0000000000000002 ***
## Checkin.service                   < 0.0000000000000002 ***
## Inflight.service                     0.000000000000237 ***
## Cleanliness                       < 0.0000000000000002 ***
## Arrival.Delay.in.Minutes          < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 115147  on 83619  degrees of freedom
## Residual deviance:  56854  on 83598  degrees of freedom
## AIC: 56898
## 
## Number of Fisher Scoring iterations: 5

After running step, we get the same model

Testing the model

Testing data pre-processing

#loading the testing dataset
flight_survey.test <- read.csv('test.csv', header=TRUE)

colSums(is.na(flight_survey.test)) #only four observations that are missing values.
##                                 X                                id 
##                                 0                                 0 
##                            Gender                     Customer.Type 
##                                 0                                 0 
##                               Age                    Type.of.Travel 
##                                 0                                 0 
##                             Class                   Flight.Distance 
##                                 0                                 0 
##             Inflight.wifi.service Departure.Arrival.time.convenient 
##                                 0                                 0 
##            Ease.of.Online.booking                     Gate.location 
##                                 0                                 0 
##                    Food.and.drink                   Online.boarding 
##                                 0                                 0 
##                      Seat.comfort            Inflight.entertainment 
##                                 0                                 0 
##                  On.board.service                  Leg.room.service 
##                                 0                                 0 
##                  Baggage.handling                   Checkin.service 
##                                 0                                 0 
##                  Inflight.service                       Cleanliness 
##                                 0                                 0 
##        Departure.Delay.in.Minutes          Arrival.Delay.in.Minutes 
##                                 0                                83 
##                      satisfaction 
##                                 0
flight_survey.test<- na.omit(flight_survey.test)

# removing X and id 
flight_survey.test <- flight_survey.test[, -c(1, 2)]



# data pre-processing
flight_survey.test$Gender <- as.factor(flight_survey.test$Gender)
flight_survey.test$Customer.Type <-as.factor(flight_survey.test$Customer.Type)
flight_survey.test$Type.of.Travel<-as.factor(flight_survey.test$Type.of.Travel)
flight_survey.test$Class<-as.factor(flight_survey.test$Class)
flight_survey.test$satisfaction<-as.factor(flight_survey.test$satisfaction)
levels(flight_survey.test$satisfaction) <- c("dissatisfied", "satisfied")

str(flight_survey.test)
## 'data.frame':    25893 obs. of  23 variables:
##  $ Gender                           : Factor w/ 2 levels "Female","Male": 1 1 2 2 1 2 1 1 2 1 ...
##  $ Customer.Type                    : Factor w/ 2 levels "disloyal Customer",..: 2 2 1 2 2 2 2 2 2 2 ...
##  $ Age                              : int  52 36 20 44 49 16 77 43 47 46 ...
##  $ Type.of.Travel                   : Factor w/ 2 levels "Business travel",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Class                            : Factor w/ 3 levels "Business","Eco",..: 2 1 2 1 2 2 1 1 2 1 ...
##  $ Flight.Distance                  : int  160 2863 192 3377 1182 311 3987 2556 556 1744 ...
##  $ Inflight.wifi.service            : int  5 1 2 0 2 3 5 2 5 2 ...
##  $ Departure.Arrival.time.convenient: int  4 1 0 0 3 3 5 2 2 2 ...
##  $ Ease.of.Online.booking           : int  3 3 2 0 4 3 5 2 2 2 ...
##  $ Gate.location                    : int  4 1 4 2 3 3 5 2 2 2 ...
##  $ Food.and.drink                   : int  3 5 2 3 4 5 3 4 5 3 ...
##  $ Online.boarding                  : int  4 4 2 4 1 5 5 4 5 4 ...
##  $ Seat.comfort                     : int  3 5 2 4 2 3 5 5 5 4 ...
##  $ Inflight.entertainment           : int  5 4 2 1 2 5 5 4 5 4 ...
##  $ On.board.service                 : int  5 4 4 1 2 4 5 4 2 4 ...
##  $ Leg.room.service                 : int  5 4 1 1 2 3 5 4 2 4 ...
##  $ Baggage.handling                 : int  5 4 3 1 2 1 5 4 5 4 ...
##  $ Checkin.service                  : int  2 3 2 3 4 1 4 5 3 5 ...
##  $ Inflight.service                 : int  5 4 2 1 2 2 5 4 3 4 ...
##  $ Cleanliness                      : int  5 5 2 4 4 5 3 3 5 4 ...
##  $ Departure.Delay.in.Minutes       : int  50 0 0 0 0 0 0 77 1 28 ...
##  $ Arrival.Delay.in.Minutes         : num  44 0 0 6 20 0 0 65 0 14 ...
##  $ satisfaction                     : Factor w/ 2 levels "dissatisfied",..: 2 2 1 2 2 2 2 2 2 2 ...
#predicting the satisfaction level 
mdl1.pred <- predict(mdl1, flight_survey.test[,-23], type="response")

# first 5 actual and predicted records 
data.frame(actual=flight_survey.test$satisfaction[1:5], predicted=mdl1.pred[1:5])
##         actual  predicted
## 1    satisfied 0.69941391
## 2    satisfied 0.87769925
## 3 dissatisfied 0.03870806
## 4    satisfied 0.31487670
## 5    satisfied 0.03355939

Setting the cutoff as 0.5

#classification by putting cut of 0.5
trainEstimatedResponse = ifelse(mdl1.pred > 0.5, "satisfied", "dissatisfied")
class(trainEstimatedResponse)
## [1] "character"
levels(as.factor(trainEstimatedResponse))
## [1] "dissatisfied" "satisfied"
# Accuracy, Estimation 
table(flight_survey.test$satisfaction, trainEstimatedResponse)
##               trainEstimatedResponse
##                dissatisfied satisfied
##   dissatisfied        13286      1242
##   satisfied            2586      8779
mean(trainEstimatedResponse==flight_survey.test$satisfaction)
## [1] 0.8521608
Calculating Accuracy
library(caret)
library(e1071)

# confusion matrix
confusionMatrix(as.factor(ifelse(mdl1.pred>0.5, 'satisfied', 'dissatisfied')), 
                flight_survey.test$satisfaction, positive = "satisfied")
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     dissatisfied satisfied
##   dissatisfied        13286      2586
##   satisfied            1242      8779
##                                                
##                Accuracy : 0.8522               
##                  95% CI : (0.8478, 0.8565)     
##     No Information Rate : 0.5611               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.6959               
##                                                
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.7725               
##             Specificity : 0.9145               
##          Pos Pred Value : 0.8761               
##          Neg Pred Value : 0.8371               
##              Prevalence : 0.4389               
##          Detection Rate : 0.3390               
##    Detection Prevalence : 0.3870               
##       Balanced Accuracy : 0.8435               
##                                                
##        'Positive' Class : satisfied            
## 
#computing accuracy per cutoff to select the best cutoff 
accT = c() 
for (cut in seq(0,1,0.1)){
  cm <- confusionMatrix(as.factor(ifelse(mdl1.pred>cut, 'satisfied', 'dissatisfied')), 
                flight_survey.test$satisfaction, positive = "satisfied")
  accT = c(accT, cm$overall[[1]])
}
## Warning in confusionMatrix.default(as.factor(ifelse(mdl1.pred > cut,
## "satisfied", : Levels are not in the same order for reference and data.
## Refactoring data to match.

## Warning in confusionMatrix.default(as.factor(ifelse(mdl1.pred > cut,
## "satisfied", : Levels are not in the same order for reference and data.
## Refactoring data to match.
# plot accuracy 
plot(accT ~ seq(0,1,0.1), xlab = "Cutoff Value", ylab = "", type = "l", ylim = c(0, 1))
lines(1-accT ~ seq(0,1,0.1), type = "l", lty = 2)
legend("topright",  c("accuracy", "overall error"), lty = c(1, 2), merge = TRUE)

#plotting the ROC curve 
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
test_roc = roc(flight_survey.test$satisfaction ~ mdl1.pred, plot = TRUE, print.auc = T)
## Setting levels: control = dissatisfied, case = satisfied
## Setting direction: controls < cases

# compute auc
auc(test_roc)
## Area under the curve: 0.902

Selecting cutoff as 0.5 only, as the overall accuracy is high with the cut off and area under the curve is 92.6%

Decision Tree

Creating default tree using rpart()

# default tree 
set.seed(321)
default.ct <- rpart(satisfaction ~ .-Departure.Delay.in.Minutes , data = flight_survey, method = "class")
names(default.ct)
##  [1] "frame"               "where"               "call"               
##  [4] "terms"               "cptable"             "method"             
##  [7] "parms"               "control"             "functions"          
## [10] "numresp"             "splits"              "csplit"             
## [13] "variable.importance" "y"                   "ordered"
#summary(default.ct)
default.ct$variable.importance
##        Online.boarding  Inflight.wifi.service           Seat.comfort 
##            14286.64627            11975.61158             6185.04966 
## Ease.of.Online.booking                  Class         Type.of.Travel 
##             6174.78151             5433.58107             5082.20430 
## Inflight.entertainment                    Age       Leg.room.service 
##             4433.65373              388.46768              125.52325 
##         Food.and.drink            Cleanliness 
##               24.42408               11.61714
length(default.ct$frame$var[default.ct$frame$var == "<leaf>"])
## [1] 6
# Plot tree
prp(default.ct, type = 2, extra = 1, under = TRUE, split.font = 1, varlen = -10,  box.palette=c("red", "green"))

Checking the accuracy of the model

#Results

# Training data
default.ct.point.pred.train <- predict(default.ct, flight_survey[, -23],type = "class")
confusionMatrix(default.ct.point.pred.train, as.factor(flight_survey$satisfaction),  positive = "satisfied")
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     dissatisfied satisfied
##   dissatisfied        39916      3575
##   satisfied            5917     34212
##                                                
##                Accuracy : 0.8865               
##                  95% CI : (0.8843, 0.8886)     
##     No Information Rate : 0.5481               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.7721               
##                                                
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9054               
##             Specificity : 0.8709               
##          Pos Pred Value : 0.8526               
##          Neg Pred Value : 0.9178               
##              Prevalence : 0.4519               
##          Detection Rate : 0.4091               
##    Detection Prevalence : 0.4799               
##       Balanced Accuracy : 0.8881               
##                                                
##        'Positive' Class : satisfied            
## 
# Testing data
default.ct.point.pred.test <- predict(default.ct,flight_survey.test[, -23],type = "class")
confusionMatrix(default.ct.point.pred.test, as.factor(flight_survey.test$satisfaction),  positive = "satisfied")
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     dissatisfied satisfied
##   dissatisfied        12561      1042
##   satisfied            1967     10323
##                                                
##                Accuracy : 0.8838               
##                  95% CI : (0.8798, 0.8877)     
##     No Information Rate : 0.5611               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.7661               
##                                                
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9083               
##             Specificity : 0.8646               
##          Pos Pred Value : 0.8400               
##          Neg Pred Value : 0.9234               
##              Prevalence : 0.4389               
##          Detection Rate : 0.3987               
##    Detection Prevalence : 0.4746               
##       Balanced Accuracy : 0.8865               
##                                                
##        'Positive' Class : satisfied            
## 

Next, we use the cross validation proceedure, which gives an overfitted model

# using cp as 0.00001
cv.ct <- rpart(satisfaction ~ .-Departure.Delay.in.Minutes , data = flight_survey, method = "class", 
               cp = 0.00001, minsplit = 5, xval=5)

prp(cv.ct, type = 2, extra = 1, under = TRUE, split.font = 1, varlen = -10,  box.palette=c("red", "green"))
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

cv.ct.point.pred.test <- predict(cv.ct,flight_survey.test[, -23],type = "class")
confusionMatrix(cv.ct.point.pred.test, as.factor(flight_survey.test$satisfaction),  positive = "satisfied")
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     dissatisfied satisfied
##   dissatisfied        13879       669
##   satisfied             649     10696
##                                              
##                Accuracy : 0.9491             
##                  95% CI : (0.9464, 0.9517)   
##     No Information Rate : 0.5611             
##     P-Value [Acc > NIR] : <0.0000000000000002
##                                              
##                   Kappa : 0.8966             
##                                              
##  Mcnemar's Test P-Value : 0.6007             
##                                              
##             Sensitivity : 0.9411             
##             Specificity : 0.9553             
##          Pos Pred Value : 0.9428             
##          Neg Pred Value : 0.9540             
##              Prevalence : 0.4389             
##          Detection Rate : 0.4131             
##    Detection Prevalence : 0.4381             
##       Balanced Accuracy : 0.9482             
##                                              
##        'Positive' Class : satisfied          
## 
#printcp(cv.ct)
# pruning the tree to avoid overfitting 
pruned.ct <- prune(cv.ct, 
                   cp = cv.ct$cptable[which.min(cv.ct$cptable[,"xerror"]),"CP"])
length(pruned.ct$frame$var[pruned.ct$frame$var == "<leaf>"])
## [1] 225
prp(pruned.ct, type = 2, extra = 1, split.font = 1, varlen = -10, box.palette=c("red", "green"))  
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

pruned.ct.point.pred.test <- predict(pruned.ct,flight_survey.test[, -23],type = "class")
confusionMatrix(pruned.ct.point.pred.test, as.factor(flight_survey.test$satisfaction),  positive = "satisfied")
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     dissatisfied satisfied
##   dissatisfied        14119       713
##   satisfied             409     10652
##                                                
##                Accuracy : 0.9567               
##                  95% CI : (0.9541, 0.9591)     
##     No Information Rate : 0.5611               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.9118               
##                                                
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9373               
##             Specificity : 0.9718               
##          Pos Pred Value : 0.9630               
##          Neg Pred Value : 0.9519               
##              Prevalence : 0.4389               
##          Detection Rate : 0.4114               
##    Detection Prevalence : 0.4272               
##       Balanced Accuracy : 0.9546               
##                                                
##        'Positive' Class : satisfied            
## 
#pruning using a lower cp
set.seed(1234)
pruned.ct1 <- prune(cv.ct, cp=0.006)
prp(pruned.ct1, type = 1, extra = 1, under = TRUE, split.font = 1, varlen = -10, 
    box.palette=c("red", "green")) 

pruned.ct.point.pred.test1 <- predict(pruned.ct1,flight_survey.test[, -23],type = "class")
confusionMatrix(pruned.ct.point.pred.test1, as.factor(flight_survey.test$satisfaction),  positive = "satisfied")
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     dissatisfied satisfied
##   dissatisfied        12930       668
##   satisfied            1598     10697
##                                                
##                Accuracy : 0.9125               
##                  95% CI : (0.909, 0.9159)      
##     No Information Rate : 0.5611               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.8239               
##                                                
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9412               
##             Specificity : 0.8900               
##          Pos Pred Value : 0.8700               
##          Neg Pred Value : 0.9509               
##              Prevalence : 0.4389               
##          Detection Rate : 0.4131               
##    Detection Prevalence : 0.4748               
##       Balanced Accuracy : 0.9156               
##                                                
##        'Positive' Class : satisfied            
## 
# plotting the ROC for the final prunned tree 
library("ROCR")
Pred.cart = predict(pruned.ct1, newdata = flight_survey.test[, -23], type = "prob")[,2] 
Pred2 = prediction(Pred.cart, flight_survey.test$satisfaction) 
plot(performance(Pred2, "tpr", "fpr"))
abline(0, 1, lty = 2)

auc = performance(Pred2, 'auc')
slot(auc, 'y.values')
## [[1]]
## [1] 0.9543035

We can also create a random forest to increase the accuracy of the model

# random forest 
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
rf <- randomForest(as.factor(satisfaction) ~.-Departure.Delay.in.Minutes  , data = flight_survey, ntree = 500, 
                   mtry = 4, nodesize = 5, importance = TRUE, parms = list(loss = lossmatrix))  

# important variables 
varImpPlot(rf)

rf.predict <- predict(rf ,flight_survey.test[, -23],type = "class")
confusionMatrix(rf.predict, as.factor(flight_survey.test$satisfaction))
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     dissatisfied satisfied
##   dissatisfied        14218       660
##   satisfied             310     10705
##                                                
##                Accuracy : 0.9625               
##                  95% CI : (0.9602, 0.9648)     
##     No Information Rate : 0.5611               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.9237               
##                                                
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9787               
##             Specificity : 0.9419               
##          Pos Pred Value : 0.9556               
##          Neg Pred Value : 0.9719               
##              Prevalence : 0.5611               
##          Detection Rate : 0.5491               
##    Detection Prevalence : 0.5746               
##       Balanced Accuracy : 0.9603               
##                                                
##        'Positive' Class : dissatisfied         
## 
#plotting the ROC curve 
predictions <- as.numeric(predict(rf, flight_survey.test[, -23], type="response"))
pred <- prediction(predictions, flight_survey.test$satisfaction)
perf <- performance(pred, measure = "tpr", x.measure = "fpr") 
plot(perf, col=rainbow(10))

auc1<- performance(pred,"auc")
print(auc1)
## A performance instance
##   'Area under the ROC curve'
slot(auc1, 'y.values')
## [[1]]
## [1] 0.9603193

Ensemble Model

In the base layer we added logistic model, decision tree, and random forest

small.index <- createDataPartition(flight_survey$satisfaction, p = 0.05, list = FALSE)
flight_survey.small <- flight_survey[small.index, ]

set.seed(4321)

control_stacking <- trainControl(method="repeatedcv", number=5, repeats=2, savePredictions=TRUE, classProbs=TRUE)

algorithms_to_use <- c('rpart', 'glm', 'rf')

stacked_models <- caretList(satisfaction ~.-Departure.Delay.in.Minutes , data = flight_survey.small, trControl = control_stacking, methodList = algorithms_to_use)
## Warning in trControlCheck(x = trControl, y = target): x$savePredictions == TRUE
## is depreciated. Setting to 'final' instead.
## Warning in trControlCheck(x = trControl, y = target): indexes not defined in
## trControl. Attempting to set them ourselves, so each model in the ensemble will
## have the same resampling indexes.
stacking_results <- resamples(stacked_models)

summary(stacking_results)
## 
## Call:
## summary.resamples(object = stacking_results)
## 
## Models: rpart, glm, rf 
## Number of resamples: 10 
## 
## Accuracy 
##            Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## rpart 0.8112306 0.8304919 0.8529573 0.8457705 0.8618850 0.8696172    0
## glm   0.8480861 0.8576980 0.8714115 0.8656150 0.8744019 0.8769415    0
## rf    0.9222488 0.9336918 0.9408245 0.9383062 0.9434809 0.9498208    0
## 
## Kappa 
##            Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## rpart 0.6145735 0.6538248 0.7024074 0.6865406 0.7193875 0.7367637    0
## glm   0.6917614 0.7112468 0.7394045 0.7276708 0.7455144 0.7521925    0
## rf    0.8433139 0.8663683 0.8803616 0.8753990 0.8855278 0.8985981    0

Next, these results are stacked on the logistic regression model

stackControl <- trainControl(method="repeatedcv", number=5, repeats=3, savePredictions=TRUE, classProbs=TRUE)
set.seed(100)
glm_stack <- caretStack(stacked_models, method="glm", metric="Accuracy", trControl=stackControl)
print(glm_stack)
## A glm ensemble of 3 base models: rpart, glm, rf
## 
## Ensemble results:
## Generalized Linear Model 
## 
## 8364 samples
##    3 predictor
##    2 classes: 'dissatisfied', 'satisfied' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 6691, 6692, 6691, 6691, 6691, 6692, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9379883  0.8746998

Based ont the classification results, arrival delays lead to reduction in satisfaction probability. Therefore, next we explore the arrival delays to see what factors contribute the most in delays.

Data Exploration of Arrival Delay dataset

We begin by loading the dataset

flight_delay <- read.csv('delayml.csv')
flight_delay <- na.omit(flight_delay) 
str(flight_delay)
## 'data.frame':    228528 obs. of  31 variables:
##  $ YEAR               : int  2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
##  $ MONTH              : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DAY                : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DAY_OF_WEEK        : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ AIRLINE            : chr  "NK" "NK" "HA" "B6" ...
##  $ FLIGHT_NUMBER      : int  597 168 17 1030 2134 2276 1057 425 89 328 ...
##  $ TAIL_NUMBER        : chr  "N528NK" "N629NK" "N389HA" "N239JB" ...
##  $ ORIGIN_AIRPORT     : chr  "MSP" "PHX" "LAS" "BQN" ...
##  $ DESTINATION_AIRPORT: chr  "FLL" "ORD" "HNL" "MCO" ...
##  $ SCHEDULED_DEPARTURE: int  115 125 145 307 400 438 515 520 520 530 ...
##  $ DEPARTURE_TIME     : int  127 237 145 304 535 550 703 620 618 623 ...
##  $ DEPARTURE_DELAY    : int  12 72 0 -3 95 72 108 60 58 53 ...
##  $ TAXI_OUT           : int  14 9 16 25 9 15 15 13 19 32 ...
##  $ WHEELS_OFF         : int  141 246 201 329 544 605 718 633 637 655 ...
##  $ SCHEDULED_TIME     : int  207 204 370 173 185 241 161 150 141 125 ...
##  $ ELAPSED_TIME       : int  220 175 385 196 175 258 155 150 137 138 ...
##  $ AIR_TIME           : int  166 156 361 160 163 237 133 132 111 96 ...
##  $ DISTANCE           : int  1487 1440 2762 1129 1189 1666 1121 1009 964 641 ...
##  $ WHEELS_ON          : int  527 622 602 509 727 902 1031 945 928 931 ...
##  $ TAXI_IN            : int  40 10 8 11 3 6 7 5 7 10 ...
##  $ SCHEDULED_ARRIVAL  : int  542 549 555 500 605 739 856 850 841 835 ...
##  $ ARRIVAL_TIME       : int  607 632 610 520 730 908 1038 950 935 941 ...
##  $ ARRIVAL_DELAY      : int  25 43 15 20 85 89 102 60 54 66 ...
##  $ DIVERTED           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CANCELLED          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CANCELLATION_REASON: chr  "" "" "" "" ...
##  $ AIR_SYSTEM_DELAY   : int  25 43 0 20 0 17 0 0 0 13 ...
##  $ SECURITY_DELAY     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AIRLINE_DELAY      : int  0 0 15 0 85 72 0 60 54 53 ...
##  $ LATE_AIRCRAFT_DELAY: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ WEATHER_DELAY      : int  0 0 0 0 0 0 102 0 0 0 ...
##  - attr(*, "na.action")= 'omit' Named int [1:820047] 1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "names")= chr [1:820047] "1" "2" "3" "4" ...

Next, we pre-process the data

flight_delay$DAY_OF_WEEK <- as.factor(flight_delay$DAY_OF_WEEK)

Distribution of arrival delays

ggplot(flight_delay, aes(ARRIVAL_DELAY)) +
  geom_histogram(aes(y=..count..),
                 fill="#c7ceea",
                 alpha = 0.8,
                 color="black", 
                 bins = 30) +
  labs(x = "Arrival Delay", y = "Frequency")

Correlation

# departure delays and arrival delays
ggplot(flight_delay, aes(x=DEPARTURE_DELAY, y=ARRIVAL_DELAY)) + geom_point()

# taxi in and arrival delays 
ggplot(flight_delay, aes(x=TAXI_IN, y=ARRIVAL_DELAY)) + geom_point()

# taxi out and arrival delays
ggplot(flight_delay, aes(x=TAXI_OUT, y=ARRIVAL_DELAY)) + geom_point()

Linear Regression Model

Dividing data in training and testing

set.seed(123)
sample_size = round(nrow(flight_delay)*.80) # 80/20 rule
train_ind <- sample(seq_len(nrow(flight_delay)), size = sample_size)
flight_delay_train <- flight_delay[train_ind,]
flight_delay_test <- flight_delay[-train_ind,]

Training the model

# setting the cross validation set
tr1 <- trainControl(method="cv", number=10)

#Model Building

myreg <- lm(ARRIVAL_DELAY ~  TAXI_OUT +  TAXI_IN+ DEPARTURE_DELAY + DISTANCE + DAY_OF_WEEK, data = flight_delay_train, trainControl=tr1)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trainControl' will be disregarded
summary(myreg)
## 
## Call:
## lm(formula = ARRIVAL_DELAY ~ TAXI_OUT + TAXI_IN + DEPARTURE_DELAY + 
##     DISTANCE + DAY_OF_WEEK, data = flight_delay_train, trainControl = tr1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -69.724  -6.636  -0.319   6.042 103.026 
## 
## Coefficients:
##                     Estimate   Std. Error  t value             Pr(>|t|)    
## (Intercept)     -17.75452581   0.08789556 -201.996 < 0.0000000000000002 ***
## TAXI_OUT          0.84791029   0.00163643  518.146 < 0.0000000000000002 ***
## TAXI_IN           0.86556765   0.00255199  339.174 < 0.0000000000000002 ***
## DEPARTURE_DELAY   0.95783968   0.00040322 2375.476 < 0.0000000000000002 ***
## DISTANCE         -0.00124128   0.00004415  -28.117 < 0.0000000000000002 ***
## DAY_OF_WEEK2      0.16797471   0.09452577    1.777              0.07557 .  
## DAY_OF_WEEK3      0.11942950   0.09863230    1.211              0.22595    
## DAY_OF_WEEK4      0.09653473   0.09137618    1.056              0.29076    
## DAY_OF_WEEK5     -0.23932816   0.09167694   -2.611              0.00904 ** 
## DAY_OF_WEEK6      0.75701216   0.10159176    7.452   0.0000000000000927 ***
## DAY_OF_WEEK7      1.20785985   0.09057564   13.335 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.15 on 182811 degrees of freedom
## Multiple R-squared:  0.9691, Adjusted R-squared:  0.9691 
## F-statistic: 5.737e+05 on 10 and 182811 DF,  p-value: < 0.00000000000000022
myreg_1 <- lm(ARRIVAL_DELAY ~  TAXI_OUT + TAXI_IN+ DISTANCE + DAY_OF_WEEK + DEPARTURE_DELAY + AIR_SYSTEM_DELAY + SECURITY_DELAY + AIRLINE_DELAY + WEATHER_DELAY, data = flight_delay_train, trainControl=tr)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trainControl' will be disregarded
summary(myreg_1)
## 
## Call:
## lm(formula = ARRIVAL_DELAY ~ TAXI_OUT + TAXI_IN + DISTANCE + 
##     DAY_OF_WEEK + DEPARTURE_DELAY + AIR_SYSTEM_DELAY + SECURITY_DELAY + 
##     AIRLINE_DELAY + WEATHER_DELAY, data = flight_delay_train, 
##     trainControl = tr)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -110.597   -5.867    0.028    5.772  110.101 
## 
## Coefficients:
##                      Estimate   Std. Error  t value             Pr(>|t|)    
## (Intercept)      -13.92128306   0.08371675 -166.290 < 0.0000000000000002 ***
## TAXI_OUT           0.67579239   0.00177733  380.229 < 0.0000000000000002 ***
## TAXI_IN            0.69579181   0.00252829  275.202 < 0.0000000000000002 ***
## DISTANCE          -0.00147131   0.00004061  -36.230 < 0.0000000000000002 ***
## DAY_OF_WEEK2       0.18340430   0.08684220    2.112               0.0347 *  
## DAY_OF_WEEK3       0.05091912   0.09063409    0.562               0.5742    
## DAY_OF_WEEK4       0.06992588   0.08394805    0.833               0.4049    
## DAY_OF_WEEK5      -0.25467196   0.08423840   -3.023               0.0025 ** 
## DAY_OF_WEEK6       0.66058868   0.09337559    7.075      0.0000000000015 ***
## DAY_OF_WEEK7       1.05557750   0.08322058   12.684 < 0.0000000000000002 ***
## DEPARTURE_DELAY    0.91028726   0.00054981 1655.641 < 0.0000000000000002 ***
## AIR_SYSTEM_DELAY   0.20758286   0.00116709  177.863 < 0.0000000000000002 ***
## SECURITY_DELAY     0.07163513   0.01319258    5.430      0.0000000564388 ***
## AIRLINE_DELAY      0.06220866   0.00072552   85.743 < 0.0000000000000002 ***
## WEATHER_DELAY      0.07871302   0.00114855   68.533 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.25 on 182807 degrees of freedom
## Multiple R-squared:  0.9739, Adjusted R-squared:  0.9739 
## F-statistic: 4.879e+05 on 14 and 182807 DF,  p-value: < 0.00000000000000022

Testing

library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## 
## Attaching package: 'forecast'
## The following object is masked from 'package:caretEnsemble':
## 
##     autoplot
# Model 1 
pred_values = predict(myreg, newdata = flight_delay_test)

flight_delay_test$pred_ad = pred_values

all.residuals <- flight_delay_test$ARRIVAL_DELAY - pred_values
My_residuals<-data.frame("Predicted" = pred_values, "Actual" = flight_delay_test$ARRIVAL_DELAY,
           "Residual" = all.residuals)
head(My_residuals)
##     Predicted Actual   Residual
## 31   65.80590     43 -22.805896
## 53   82.08880     85   2.911204
## 56   67.15056     89  21.849441
## 107 143.33181    128 -15.331809
## 112  21.76651     25   3.233488
## 172 108.22121    116   7.778790
accuracy(pred_values, flight_delay_test$ARRIVAL_DELAY)
##                  ME     RMSE      MAE       MPE     MAPE
## Test set 0.02267564 11.17065 8.290572 -1.851053 23.88622
# Model 2 
pred_values_1 = predict(myreg_1, newdata = flight_delay_test)

flight_delay_test$pred_ad_1 = pred_values_1

all.residuals_1 <- flight_delay_test$ARRIVAL_DELAY - pred_values_1
My_residuals_1<-data.frame("Predicted" = pred_values_1, "Actual" = flight_delay_test$ARRIVAL_DELAY,
           "Residual" = all.residuals_1)
head(My_residuals_1)
##     Predicted Actual    Residual
## 31   71.53675     43 -28.5367542
## 53   84.33379     85   0.6662097
## 56   71.55769     89  17.4423051
## 107 145.50267    128 -17.5026745
## 112  22.04891     25   2.9510865
## 172 112.75542    116   3.2445843
accuracy(pred_values_1, flight_delay_test$ARRIVAL_DELAY)
##                  ME     RMSE      MAE      MPE    MAPE
## Test set 0.01640536 10.22251 7.588153 -1.80699 20.9394

Clustering Analysis

We begin by loading libraries and storing dataset as a dataframe

flight_cluster <- read.csv('test.csv', header = TRUE)

Pre processing the dataset

flight_cluster$Gender<-as.factor(flight_cluster$Gender)
flight_cluster$Customer.Type<-as.factor(flight_cluster$Customer.Type)
flight_cluster$Type.of.Travel<-as.factor(flight_cluster$Type.of.Travel)
flight_cluster$Class<-as.factor(flight_cluster$Class)
flight_cluster$satisfaction<-as.factor(flight_cluster$satisfaction)
flight_cluster$Gender<-unclass(flight_cluster$Gender)
flight_cluster$Customer.Type<-unclass(flight_cluster$Customer.Type)
flight_cluster$Type.of.Travel<-unclass(flight_cluster$Type.of.Travel)
flight_cluster$Class<-unclass(flight_cluster$Class)
flight_cluster$satisfaction<-unclass(flight_cluster$satisfaction)
flight_cluster$Arrival.Delay.in.Minutes<-as.integer(flight_cluster$Arrival.Delay.in.Minutes)
str(flight_cluster) #converting all variables to integers.
## 'data.frame':    25976 obs. of  25 variables:
##  $ X                                : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ id                               : int  19556 90035 12360 77959 36875 39177 79433 97286 27508 62482 ...
##  $ Gender                           : int  1 1 2 2 1 2 1 1 2 1 ...
##   ..- attr(*, "levels")= chr [1:2] "Female" "Male"
##  $ Customer.Type                    : int  2 2 1 2 2 2 2 2 2 2 ...
##   ..- attr(*, "levels")= chr [1:2] "disloyal Customer" "Loyal Customer"
##  $ Age                              : int  52 36 20 44 49 16 77 43 47 46 ...
##  $ Type.of.Travel                   : int  1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "levels")= chr [1:2] "Business travel" "Personal Travel"
##  $ Class                            : int  2 1 2 1 2 2 1 1 2 1 ...
##   ..- attr(*, "levels")= chr [1:3] "Business" "Eco" "Eco Plus"
##  $ Flight.Distance                  : int  160 2863 192 3377 1182 311 3987 2556 556 1744 ...
##  $ Inflight.wifi.service            : int  5 1 2 0 2 3 5 2 5 2 ...
##  $ Departure.Arrival.time.convenient: int  4 1 0 0 3 3 5 2 2 2 ...
##  $ Ease.of.Online.booking           : int  3 3 2 0 4 3 5 2 2 2 ...
##  $ Gate.location                    : int  4 1 4 2 3 3 5 2 2 2 ...
##  $ Food.and.drink                   : int  3 5 2 3 4 5 3 4 5 3 ...
##  $ Online.boarding                  : int  4 4 2 4 1 5 5 4 5 4 ...
##  $ Seat.comfort                     : int  3 5 2 4 2 3 5 5 5 4 ...
##  $ Inflight.entertainment           : int  5 4 2 1 2 5 5 4 5 4 ...
##  $ On.board.service                 : int  5 4 4 1 2 4 5 4 2 4 ...
##  $ Leg.room.service                 : int  5 4 1 1 2 3 5 4 2 4 ...
##  $ Baggage.handling                 : int  5 4 3 1 2 1 5 4 5 4 ...
##  $ Checkin.service                  : int  2 3 2 3 4 1 4 5 3 5 ...
##  $ Inflight.service                 : int  5 4 2 1 2 2 5 4 3 4 ...
##  $ Cleanliness                      : int  5 5 2 4 4 5 3 3 5 4 ...
##  $ Departure.Delay.in.Minutes       : int  50 0 0 0 0 0 0 77 1 28 ...
##  $ Arrival.Delay.in.Minutes         : int  44 0 0 6 20 0 0 65 0 14 ...
##  $ satisfaction                     : int  2 2 1 2 2 2 2 2 2 2 ...
##   ..- attr(*, "levels")= chr [1:2] "neutral or dissatisfied" "satisfied"
iqr <- IQR(flight_cluster$Departure.Delay.in.Minutes) #removing outliers in departure delays
Q <- quantile(flight_cluster$Departure.Delay.in.Minutes, probs=c(.25, .75), na.rm = FALSE)
flight_cluster<- subset(flight_cluster, flight_cluster$Departure.Delay.in.Minutes > (Q[1] - 2.5*iqr) & flight_cluster$Departure.Delay.in.Minutes < (Q[2]+2.5*iqr))
iqr2 <- IQR(flight_cluster$Arrival.Delay.in.Minutes, na.rm = TRUE) #removing outliers in arrival delays
Q1 <- quantile(flight_cluster$Arrival.Delay.in.Minutes, probs=c(.25, .75), na.rm = TRUE)
flight_cluster<- subset(flight_cluster, flight_cluster$Arrival.Delay.in.Minutes > (Q1[1] - 2.5*iqr2) & flight_cluster$Arrival.Delay.in.Minutes < (Q1[2]+2.5*iqr2))
summary(flight_cluster)
##        X               id             Gender      Customer.Type  
##  Min.   :    1   Min.   :    17   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 6544   1st Qu.: 32899   1st Qu.:1.000   1st Qu.:2.000  
##  Median :12978   Median : 65920   Median :1.000   Median :2.000  
##  Mean   :13005   Mean   : 65379   Mean   :1.493   Mean   :1.818  
##  3rd Qu.:19521   3rd Qu.: 96972   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :25975   Max.   :129877   Max.   :2.000   Max.   :2.000  
##       Age        Type.of.Travel      Class       Flight.Distance
##  Min.   : 7.00   Min.   :1.000   Min.   :1.000   Min.   :  31   
##  1st Qu.:27.00   1st Qu.:1.000   1st Qu.:1.000   1st Qu.: 409   
##  Median :40.00   Median :1.000   Median :2.000   Median : 853   
##  Mean   :39.67   Mean   :1.308   Mean   :1.587   Mean   :1197   
##  3rd Qu.:51.00   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:1750   
##  Max.   :85.00   Max.   :2.000   Max.   :3.000   Max.   :4983   
##  Inflight.wifi.service Departure.Arrival.time.convenient Ease.of.Online.booking
##  Min.   :0.000         Min.   :0.000                     Min.   :0.000         
##  1st Qu.:2.000         1st Qu.:2.000                     1st Qu.:2.000         
##  Median :3.000         Median :3.000                     Median :3.000         
##  Mean   :2.735         Mean   :3.047                     Mean   :2.758         
##  3rd Qu.:4.000         3rd Qu.:4.000                     3rd Qu.:4.000         
##  Max.   :5.000         Max.   :5.000                     Max.   :5.000         
##  Gate.location   Food.and.drink  Online.boarding  Seat.comfort 
##  Min.   :1.000   Min.   :0.000   Min.   :0.000   Min.   :1.00  
##  1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.00  
##  Median :3.000   Median :3.000   Median :4.000   Median :4.00  
##  Mean   :2.967   Mean   :3.232   Mean   :3.282   Mean   :3.47  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:5.00  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.00  
##  Inflight.entertainment On.board.service Leg.room.service Baggage.handling
##  Min.   :0.000          Min.   :0.000    Min.   :0.000    Min.   :1.000   
##  1st Qu.:2.000          1st Qu.:3.000    1st Qu.:2.000    1st Qu.:3.000   
##  Median :4.000          Median :4.000    Median :4.000    Median :4.000   
##  Mean   :3.379          Mean   :3.408    Mean   :3.356    Mean   :3.644   
##  3rd Qu.:5.000          3rd Qu.:4.000    3rd Qu.:4.000    3rd Qu.:5.000   
##  Max.   :5.000          Max.   :5.000    Max.   :5.000    Max.   :5.000   
##  Checkin.service Inflight.service  Cleanliness    Departure.Delay.in.Minutes
##  Min.   :1.000   Min.   :0.000    Min.   :1.000   Min.   : 0.000            
##  1st Qu.:3.000   1st Qu.:3.000    1st Qu.:2.000   1st Qu.: 0.000            
##  Median :3.000   Median :4.000    Median :3.000   Median : 0.000            
##  Mean   :3.331   Mean   :3.677    Mean   :3.304   Mean   : 3.103            
##  3rd Qu.:4.000   3rd Qu.:5.000    3rd Qu.:4.000   3rd Qu.: 3.000            
##  Max.   :5.000   Max.   :5.000    Max.   :5.000   Max.   :41.000            
##  Arrival.Delay.in.Minutes  satisfaction  
##  Min.   : 0.000           Min.   :1.000  
##  1st Qu.: 0.000           1st Qu.:1.000  
##  Median : 0.000           Median :1.000  
##  Mean   : 2.546           Mean   :1.456  
##  3rd Qu.: 3.000           3rd Qu.:2.000  
##  Max.   :20.000           Max.   :2.000
summary(flight_cluster$satisfaction)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.456   2.000   2.000

Preparing the dataset for PAM clustering

flight_PAM <- read.csv('test.csv', header = TRUE)
flight_PAM$Gender<-as.factor(flight_PAM$Gender)
flight_PAM$Customer.Type<-as.factor(flight_PAM$Customer.Type)
flight_PAM$Type.of.Travel<-as.factor(flight_PAM$Type.of.Travel)
flight_PAM$Class<-as.factor(flight_PAM$Class)
flight_PAM$satisfaction<-as.factor(flight_PAM$satisfaction)
flight_PAM$Arrival.Delay.in.Minutes<-as.integer(flight_PAM$Arrival.Delay.in.Minutes)
str(flight_PAM) 
## 'data.frame':    25976 obs. of  25 variables:
##  $ X                                : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ id                               : int  19556 90035 12360 77959 36875 39177 79433 97286 27508 62482 ...
##  $ Gender                           : Factor w/ 2 levels "Female","Male": 1 1 2 2 1 2 1 1 2 1 ...
##  $ Customer.Type                    : Factor w/ 2 levels "disloyal Customer",..: 2 2 1 2 2 2 2 2 2 2 ...
##  $ Age                              : int  52 36 20 44 49 16 77 43 47 46 ...
##  $ Type.of.Travel                   : Factor w/ 2 levels "Business travel",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Class                            : Factor w/ 3 levels "Business","Eco",..: 2 1 2 1 2 2 1 1 2 1 ...
##  $ Flight.Distance                  : int  160 2863 192 3377 1182 311 3987 2556 556 1744 ...
##  $ Inflight.wifi.service            : int  5 1 2 0 2 3 5 2 5 2 ...
##  $ Departure.Arrival.time.convenient: int  4 1 0 0 3 3 5 2 2 2 ...
##  $ Ease.of.Online.booking           : int  3 3 2 0 4 3 5 2 2 2 ...
##  $ Gate.location                    : int  4 1 4 2 3 3 5 2 2 2 ...
##  $ Food.and.drink                   : int  3 5 2 3 4 5 3 4 5 3 ...
##  $ Online.boarding                  : int  4 4 2 4 1 5 5 4 5 4 ...
##  $ Seat.comfort                     : int  3 5 2 4 2 3 5 5 5 4 ...
##  $ Inflight.entertainment           : int  5 4 2 1 2 5 5 4 5 4 ...
##  $ On.board.service                 : int  5 4 4 1 2 4 5 4 2 4 ...
##  $ Leg.room.service                 : int  5 4 1 1 2 3 5 4 2 4 ...
##  $ Baggage.handling                 : int  5 4 3 1 2 1 5 4 5 4 ...
##  $ Checkin.service                  : int  2 3 2 3 4 1 4 5 3 5 ...
##  $ Inflight.service                 : int  5 4 2 1 2 2 5 4 3 4 ...
##  $ Cleanliness                      : int  5 5 2 4 4 5 3 3 5 4 ...
##  $ Departure.Delay.in.Minutes       : int  50 0 0 0 0 0 0 77 1 28 ...
##  $ Arrival.Delay.in.Minutes         : int  44 0 0 6 20 0 0 65 0 14 ...
##  $ satisfaction                     : Factor w/ 2 levels "neutral or dissatisfied",..: 2 2 1 2 2 2 2 2 2 2 ...
iqr <- IQR(flight_PAM$Departure.Delay.in.Minutes)
Q <- quantile(flight_PAM$Departure.Delay.in.Minutes, probs=c(.25, .75), na.rm = FALSE)
flight_PAM<- subset(flight_PAM, flight_PAM$Departure.Delay.in.Minutes > (Q[1] - 2.5*iqr) & flight_PAM$Departure.Delay.in.Minutes < (Q[2]+2.5*iqr))
iqr2 <- IQR(flight_PAM$Arrival.Delay.in.Minutes, na.rm = TRUE)
Q1 <- quantile(flight_PAM$Arrival.Delay.in.Minutes, probs=c(.25, .75), na.rm = TRUE)
flight_PAM<- subset(flight_PAM, flight_PAM$Arrival.Delay.in.Minutes > (Q1[1] - 2.5*iqr2) & flight_PAM$Arrival.Delay.in.Minutes < (Q1[2]+2.5*iqr2))
summary(flight_PAM)
##        X               id            Gender                Customer.Type  
##  Min.   :    1   Min.   :    17   Female:10603   disloyal Customer: 3810  
##  1st Qu.: 6544   1st Qu.: 32899   Male  :10309   Loyal Customer   :17102  
##  Median :12978   Median : 65920                                           
##  Mean   :13005   Mean   : 65379                                           
##  3rd Qu.:19521   3rd Qu.: 96972                                           
##  Max.   :25975   Max.   :129877                                           
##       Age                Type.of.Travel       Class       Flight.Distance
##  Min.   : 7.00   Business travel:14461   Business:10157   Min.   :  31   
##  1st Qu.:27.00   Personal Travel: 6451   Eco     : 9228   1st Qu.: 409   
##  Median :40.00                           Eco Plus: 1527   Median : 853   
##  Mean   :39.67                                            Mean   :1197   
##  3rd Qu.:51.00                                            3rd Qu.:1750   
##  Max.   :85.00                                            Max.   :4983   
##  Inflight.wifi.service Departure.Arrival.time.convenient Ease.of.Online.booking
##  Min.   :0.000         Min.   :0.000                     Min.   :0.000         
##  1st Qu.:2.000         1st Qu.:2.000                     1st Qu.:2.000         
##  Median :3.000         Median :3.000                     Median :3.000         
##  Mean   :2.735         Mean   :3.047                     Mean   :2.758         
##  3rd Qu.:4.000         3rd Qu.:4.000                     3rd Qu.:4.000         
##  Max.   :5.000         Max.   :5.000                     Max.   :5.000         
##  Gate.location   Food.and.drink  Online.boarding  Seat.comfort 
##  Min.   :1.000   Min.   :0.000   Min.   :0.000   Min.   :1.00  
##  1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.00  
##  Median :3.000   Median :3.000   Median :4.000   Median :4.00  
##  Mean   :2.967   Mean   :3.232   Mean   :3.282   Mean   :3.47  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:5.00  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.00  
##  Inflight.entertainment On.board.service Leg.room.service Baggage.handling
##  Min.   :0.000          Min.   :0.000    Min.   :0.000    Min.   :1.000   
##  1st Qu.:2.000          1st Qu.:3.000    1st Qu.:2.000    1st Qu.:3.000   
##  Median :4.000          Median :4.000    Median :4.000    Median :4.000   
##  Mean   :3.379          Mean   :3.408    Mean   :3.356    Mean   :3.644   
##  3rd Qu.:5.000          3rd Qu.:4.000    3rd Qu.:4.000    3rd Qu.:5.000   
##  Max.   :5.000          Max.   :5.000    Max.   :5.000    Max.   :5.000   
##  Checkin.service Inflight.service  Cleanliness    Departure.Delay.in.Minutes
##  Min.   :1.000   Min.   :0.000    Min.   :1.000   Min.   : 0.000            
##  1st Qu.:3.000   1st Qu.:3.000    1st Qu.:2.000   1st Qu.: 0.000            
##  Median :3.000   Median :4.000    Median :3.000   Median : 0.000            
##  Mean   :3.331   Mean   :3.677    Mean   :3.304   Mean   : 3.103            
##  3rd Qu.:4.000   3rd Qu.:5.000    3rd Qu.:4.000   3rd Qu.: 3.000            
##  Max.   :5.000   Max.   :5.000    Max.   :5.000   Max.   :41.000            
##  Arrival.Delay.in.Minutes                  satisfaction  
##  Min.   : 0.000           neutral or dissatisfied:11367  
##  1st Qu.: 0.000           satisfied              : 9545  
##  Median : 0.000                                          
##  Mean   : 2.546                                          
##  3rd Qu.: 3.000                                          
##  Max.   :20.000
summary(flight_PAM$satisfaction)
## neutral or dissatisfied               satisfied 
##                   11367                    9545

Viewing the Distribution of Customer Satisfaction

hist(flight_cluster$satisfaction, data=flight_cluster, main="Distribution of Satisfaction", xlab="Customer Satisfaction")
## Warning in plot.window(xlim, ylim, "", ...): "data" is not a graphical parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "data"
## is not a graphical parameter
## Warning in axis(1, ...): "data" is not a graphical parameter
## Warning in axis(2, ...): "data" is not a graphical parameter

Reducing the Data allowing us to Process Results without Issue.

set.seed(1)
smaller.index <-createDataPartition(flight_cluster$satisfaction, p = 0.15, list = FALSE)
 #Using create data partition function due to uneven distribution of satisfaction.
flight_clean <- flight_cluster[smaller.index, ]#could not interpret the entire data set due to processing power limitations.
smaller.indexPAM <-createDataPartition(flight_PAM$satisfaction, p = 0.15, list = FALSE) #This is for PAM Clustering
flight_clean_PAM <- flight_PAM[smaller.indexPAM, ]

Establishing the Testing Cluster Dataset

set.seed(1)
smaller.index.test <-createDataPartition(flight_clean$satisfaction, p = 0.95, list = FALSE)
flight_clean_test <- flight_cluster[smaller.index.test, ]

Cleaning from NA Values

colSums(is.na(flight_clean)) #only four observations that are missing values.
##                                 X                                id 
##                                 0                                 0 
##                            Gender                     Customer.Type 
##                                 0                                 0 
##                               Age                    Type.of.Travel 
##                                 0                                 0 
##                             Class                   Flight.Distance 
##                                 0                                 0 
##             Inflight.wifi.service Departure.Arrival.time.convenient 
##                                 0                                 0 
##            Ease.of.Online.booking                     Gate.location 
##                                 0                                 0 
##                    Food.and.drink                   Online.boarding 
##                                 0                                 0 
##                      Seat.comfort            Inflight.entertainment 
##                                 0                                 0 
##                  On.board.service                  Leg.room.service 
##                                 0                                 0 
##                  Baggage.handling                   Checkin.service 
##                                 0                                 0 
##                  Inflight.service                       Cleanliness 
##                                 0                                 0 
##        Departure.Delay.in.Minutes          Arrival.Delay.in.Minutes 
##                                 0                                 0 
##                      satisfaction 
##                                 0
flight_clean_PAM <- na.omit(flight_clean_PAM)
flight_clean <- na.omit(flight_clean)
flight_clean_test <- na.omit(flight_clean_test)

Removing the Row ID and Customer ID, then Normalizing the Data

flight_clean <- flight_clean[-c(1,2)]
flight_clean_PAM <- flight_clean_PAM[-c(1,2)]
flight_clean_test <- flight_clean_test[-c(1,2)]
head(flight_clean,3)
##    Gender Customer.Type Age Type.of.Travel Class Flight.Distance
## 17      1             2  31              1     2             728
## 33      2             1  41              1     2             624
## 48      1             2  59              2     2             460
##    Inflight.wifi.service Departure.Arrival.time.convenient
## 17                     2                                 5
## 33                     2                                 3
## 48                     2                                 5
##    Ease.of.Online.booking Gate.location Food.and.drink Online.boarding
## 17                      5             5              2               2
## 33                      2             4              5               2
## 48                      2             5              3               3
##    Seat.comfort Inflight.entertainment On.board.service Leg.room.service
## 17            2                      2                4                3
## 33            5                      5                4                3
## 48            4                      5                5                2
##    Baggage.handling Checkin.service Inflight.service Cleanliness
## 17                3               4                3           2
## 33                3               1                4           5
## 48                4               1                5           2
##    Departure.Delay.in.Minutes Arrival.Delay.in.Minutes satisfaction
## 17                          2                        0            1
## 33                          0                        0            1
## 48                          4                        2            1
flight_clean_standardized<-as.data.frame(scale(flight_clean[1:23])) #normalizing data set considering there are multiple different measurements included in the data.
flight_clean__test_standardized<-as.data.frame(scale(flight_clean_test[1:23]))
flight_clean_PAM_standardized<-as.data.frame(scale(flight_clean_PAM[6:22]))

K-Means clustering

Distance Matrix
distance <- get_dist(flight_clean_standardized, method = "euclidean")
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

K-means clusters
k2 <- kmeans(flight_clean_standardized, centers = 2, nstart=10, iter.max=10)
k3 <- kmeans(flight_clean_standardized, centers = 3, nstart = 25, iter.max = 10)
k4 <- kmeans(flight_clean_standardized, centers = 4, nstart = 25, iter.max = 10)
k5 <- kmeans(flight_clean_standardized, centers = 5, nstart = 25, iter.max = 10)
k10 <- kmeans(flight_clean_standardized, centers = 10, nstart = 25, iter.max = 10)
# plots to compare
p2 <- fviz_cluster(k2, geom = "point", data = flight_clean_standardized) + ggtitle("k = 2")
p3 <- fviz_cluster(k3, geom = "point",  data = flight_clean_standardized) + ggtitle("k = 3")
p4 <- fviz_cluster(k4, geom = "point",  data = flight_clean_standardized) + ggtitle("k = 4")
p5 <- fviz_cluster(k5, geom = "point",  data = flight_clean_standardized) + ggtitle("k = 5")
p10 <- fviz_cluster(k10, geom = "point",  data = flight_clean_standardized) + ggtitle("k = 10")

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
grid.arrange(p2, p3, p4, p5, p10, nrow = 3)

Elbow Method and Average Silhouette Method:
my1<-fviz_nbclust(flight_clean_standardized, kmeans, method="wss")
my2<-fviz_nbclust(flight_clean_standardized, kmeans, method = "silhouette")
grid.arrange(my1, my2, nrow = 2)

GAP Statistic Graph
gap_stat <- clusGap(flight_clean_standardized, FUN = kmeans, nstart = 10,
                    K.max = 10, B = 50)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 156850)
## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 156850)
## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations
fviz_gap_stat(gap_stat)

RSQ vs Silhouette
kmeans_perf2 = function(data,maxc,ns)
{
  result = as.data.frame(matrix(ncol=3, nrow=maxc-1))
  colnames(result) = c("clusters", "rsq","silhouette")
  dst <- daisy(data)
  for(i in 2:maxc) {
    cst <- kmeans(data,i,iter.max=100,nstart=ns)
    rsq <- 1-cst$tot.withinss/(cst$totss)
    slht <- silhouette(cst$cluster,dst)
    result[i-1,]=c(i,rsq,mean(slht[,3]))
  }
  ggplot(result, aes(clusters)) + 
    geom_line(aes(y = rsq, colour = "rsq")) + 
    geom_line(aes(y = silhouette, colour = "silhouette"))
  
}
kmeans_perf2(flight_clean_standardized,15,ns=10)
## Warning in daisy(data): binary variable(s) 1, 2, 4, 23 treated as interval
## scaled

Final Cluster
set.seed(1)
final <- kmeans(flight_clean_standardized, 2, nstart = 25, iter.max = 10)
#print(final)
fviz_cluster(final, geom = "point",data = flight_clean_standardized)

Final Cluster Results
flight_clean_standardized %>%
  mutate(Cluster = final$cluster) %>%
  group_by(Cluster) %>%
  summarise_all("mean")
## # A tibble: 2 × 24
##   Cluster  Gender Customer.Type    Age Type.of.Travel  Class Flight.Distance
##     <int>   <dbl>         <dbl>  <dbl>          <dbl>  <dbl>           <dbl>
## 1       1 -0.0212       -0.0994 -0.117          0.327  0.401          -0.261
## 2       2  0.0215        0.101   0.118         -0.331 -0.406           0.265
## # … with 17 more variables: Inflight.wifi.service <dbl>,
## #   Departure.Arrival.time.convenient <dbl>, Ease.of.Online.booking <dbl>,
## #   Gate.location <dbl>, Food.and.drink <dbl>, Online.boarding <dbl>,
## #   Seat.comfort <dbl>, Inflight.entertainment <dbl>, On.board.service <dbl>,
## #   Leg.room.service <dbl>, Baggage.handling <dbl>, Checkin.service <dbl>,
## #   Inflight.service <dbl>, Cleanliness <dbl>,
## #   Departure.Delay.in.Minutes <dbl>, Arrival.Delay.in.Minutes <dbl>, …
flight_clean_standardized$Cluster<-as.factor(final$cluster)
flight_clean$Cluster<-as.factor(final$cluster) #adding as a factor variable to the original data set so you can use it for supervised learning.

flight_clean %>%
  group_by(Cluster) %>%
  summarise_all("mean")
## # A tibble: 2 × 24
##   Cluster Gender Customer.Type   Age Type.of.Travel Class Flight.Distance
##   <fct>    <dbl>         <dbl> <dbl>          <dbl> <dbl>           <dbl>
## 1 1         1.49          1.78  37.5           1.48  1.85            920.
## 2 2         1.51          1.86  41.1           1.17  1.35           1446.
## # … with 17 more variables: Inflight.wifi.service <dbl>,
## #   Departure.Arrival.time.convenient <dbl>, Ease.of.Online.booking <dbl>,
## #   Gate.location <dbl>, Food.and.drink <dbl>, Online.boarding <dbl>,
## #   Seat.comfort <dbl>, Inflight.entertainment <dbl>, On.board.service <dbl>,
## #   Leg.room.service <dbl>, Baggage.handling <dbl>, Checkin.service <dbl>,
## #   Inflight.service <dbl>, Cleanliness <dbl>,
## #   Departure.Delay.in.Minutes <dbl>, Arrival.Delay.in.Minutes <dbl>, …
k_results <- flight_clean %>%
  mutate(cluster = final$clustering) %>%
  group_by(Cluster) %>%
  do(the_summary = summary(.))
k_results$the_summary
## [[1]]
##      Gender      Customer.Type        Age        Type.of.Travel      Class     
##  Min.   :1.000   Min.   :1.000   Min.   : 7.00   Min.   :1.000   Min.   :1.00  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:25.00   1st Qu.:1.000   1st Qu.:1.00  
##  Median :1.000   Median :2.000   Median :37.00   Median :1.000   Median :2.00  
##  Mean   :1.488   Mean   :1.778   Mean   :37.51   Mean   :1.475   Mean   :1.85  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:49.00   3rd Qu.:2.000   3rd Qu.:2.00  
##  Max.   :2.000   Max.   :2.000   Max.   :80.00   Max.   :2.000   Max.   :3.00  
##  Flight.Distance  Inflight.wifi.service Departure.Arrival.time.convenient
##  Min.   :  67.0   Min.   :0.00          Min.   :0.000                    
##  1st Qu.: 368.0   1st Qu.:2.00          1st Qu.:2.000                    
##  Median : 666.0   Median :2.00          Median :3.000                    
##  Mean   : 919.7   Mean   :2.33          Mean   :3.079                    
##  3rd Qu.:1107.0   3rd Qu.:3.00          3rd Qu.:4.000                    
##  Max.   :4243.0   Max.   :5.00          Max.   :5.000                    
##  Ease.of.Online.booking Gate.location   Food.and.drink  Online.boarding
##  Min.   :0.000          Min.   :1.000   Min.   :1.000   Min.   :0.000  
##  1st Qu.:2.000          1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :2.000          Median :3.000   Median :2.000   Median :3.000  
##  Mean   :2.503          Mean   :2.944   Mean   :2.558   Mean   :2.587  
##  3rd Qu.:3.000          3rd Qu.:4.000   3rd Qu.:3.000   3rd Qu.:3.000  
##  Max.   :5.000          Max.   :5.000   Max.   :5.000   Max.   :5.000  
##   Seat.comfort   Inflight.entertainment On.board.service Leg.room.service
##  Min.   :1.000   Min.   :1.000          Min.   :1.00     Min.   :0.00    
##  1st Qu.:2.000   1st Qu.:2.000          1st Qu.:2.00     1st Qu.:2.00    
##  Median :3.000   Median :2.000          Median :3.00     Median :3.00    
##  Mean   :2.694   Mean   :2.437          Mean   :2.89     Mean   :2.88    
##  3rd Qu.:4.000   3rd Qu.:3.000          3rd Qu.:4.00     3rd Qu.:4.00    
##  Max.   :5.000   Max.   :5.000          Max.   :5.00     Max.   :5.00    
##  Baggage.handling Checkin.service Inflight.service  Cleanliness   
##  Min.   :1.000    Min.   :1.000   Min.   :1.000    Min.   :1.000  
##  1st Qu.:2.000    1st Qu.:2.000   1st Qu.:2.000    1st Qu.:2.000  
##  Median :3.000    Median :3.000   Median :3.000    Median :2.000  
##  Mean   :3.206    Mean   :2.967   Mean   :3.267    Mean   :2.529  
##  3rd Qu.:4.000    3rd Qu.:4.000   3rd Qu.:4.000    3rd Qu.:3.000  
##  Max.   :5.000    Max.   :5.000   Max.   :5.000    Max.   :5.000  
##  Departure.Delay.in.Minutes Arrival.Delay.in.Minutes  satisfaction   Cluster 
##  Min.   : 0.000             Min.   : 0.000           Min.   :1.000   1:1578  
##  1st Qu.: 0.000             1st Qu.: 0.000           1st Qu.:1.000   2:   0  
##  Median : 0.000             Median : 0.000           Median :1.000           
##  Mean   : 3.523             Mean   : 2.963           Mean   :1.115           
##  3rd Qu.: 4.000             3rd Qu.: 4.000           3rd Qu.:1.000           
##  Max.   :41.000             Max.   :20.000           Max.   :2.000           
## 
## [[2]]
##      Gender      Customer.Type        Age        Type.of.Travel      Class     
##  Min.   :1.000   Min.   :1.000   Min.   : 7.00   Min.   :1.000   Min.   :1.00  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:30.00   1st Qu.:1.000   1st Qu.:1.00  
##  Median :2.000   Median :2.000   Median :42.00   Median :1.000   Median :1.00  
##  Mean   :1.509   Mean   :1.855   Mean   :41.09   Mean   :1.168   Mean   :1.35  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:52.00   3rd Qu.:1.000   3rd Qu.:2.00  
##  Max.   :2.000   Max.   :2.000   Max.   :80.00   Max.   :2.000   Max.   :3.00  
##  Flight.Distance Inflight.wifi.service Departure.Arrival.time.convenient
##  Min.   :  31    Min.   :0.000         Min.   :0.000                    
##  1st Qu.: 489    1st Qu.:2.000         1st Qu.:2.000                    
##  Median :1085    Median :3.000         Median :3.000                    
##  Mean   :1446    Mean   :3.186         Mean   :3.128                    
##  3rd Qu.:2264    3rd Qu.:4.000         3rd Qu.:4.000                    
##  Max.   :4983    Max.   :5.000         Max.   :5.000                    
##  Ease.of.Online.booking Gate.location   Food.and.drink  Online.boarding
##  Min.   :0.000          Min.   :1.000   Min.   :1.000   Min.   :0.000  
##  1st Qu.:2.000          1st Qu.:2.000   1st Qu.:3.000   1st Qu.:4.000  
##  Median :3.000          Median :3.000   Median :4.000   Median :4.000  
##  Mean   :3.097          Mean   :3.037   Mean   :3.875   Mean   :3.963  
##  3rd Qu.:4.000          3rd Qu.:4.000   3rd Qu.:5.000   3rd Qu.:5.000  
##  Max.   :5.000          Max.   :5.000   Max.   :5.000   Max.   :5.000  
##   Seat.comfort   Inflight.entertainment On.board.service Leg.room.service
##  Min.   :1.000   Min.   :1.000          Min.   :1.000    Min.   :1.000   
##  1st Qu.:4.000   1st Qu.:4.000          1st Qu.:3.000    1st Qu.:3.000   
##  Median :4.000   Median :4.000          Median :4.000    Median :4.000   
##  Mean   :4.219   Mean   :4.314          Mean   :3.917    Mean   :3.832   
##  3rd Qu.:5.000   3rd Qu.:5.000          3rd Qu.:5.000    3rd Qu.:5.000   
##  Max.   :5.000   Max.   :5.000          Max.   :5.000    Max.   :5.000   
##  Baggage.handling Checkin.service Inflight.service  Cleanliness   
##  Min.   :1.000    Min.   :1.000   Min.   :1.000    Min.   :1.000  
##  1st Qu.:4.000    1st Qu.:3.000   1st Qu.:4.000    1st Qu.:4.000  
##  Median :4.000    Median :4.000   Median :4.000    Median :4.000  
##  Mean   :4.117    Mean   :3.702   Mean   :4.128    Mean   :4.091  
##  3rd Qu.:5.000    3rd Qu.:5.000   3rd Qu.:5.000    3rd Qu.:5.000  
##  Max.   :5.000    Max.   :5.000   Max.   :5.000    Max.   :5.000  
##  Departure.Delay.in.Minutes Arrival.Delay.in.Minutes  satisfaction   Cluster 
##  Min.   : 0.000             Min.   : 0.000           Min.   :1.000   1:   0  
##  1st Qu.: 0.000             1st Qu.: 0.000           1st Qu.:2.000   2:1559  
##  Median : 0.000             Median : 0.000           Median :2.000           
##  Mean   : 2.836             Mean   : 2.261           Mean   :1.758           
##  3rd Qu.: 3.000             3rd Qu.: 1.000           3rd Qu.:2.000           
##  Max.   :38.000             Max.   :20.000           Max.   :2.000
flight_clean %>%
  group_by(Cluster) %>%
  summarise_all("median")
## # A tibble: 2 × 24
##   Cluster Gender Customer.Type   Age Type.of.Travel Class Flight.Distance
##   <fct>    <dbl>         <dbl> <dbl>          <dbl> <dbl>           <dbl>
## 1 1            1             2    37              1     2             666
## 2 2            2             2    42              1     1            1085
## # … with 17 more variables: Inflight.wifi.service <dbl>,
## #   Departure.Arrival.time.convenient <dbl>, Ease.of.Online.booking <dbl>,
## #   Gate.location <dbl>, Food.and.drink <dbl>, Online.boarding <dbl>,
## #   Seat.comfort <dbl>, Inflight.entertainment <dbl>, On.board.service <dbl>,
## #   Leg.room.service <dbl>, Baggage.handling <dbl>, Checkin.service <dbl>,
## #   Inflight.service <dbl>, Cleanliness <dbl>,
## #   Departure.Delay.in.Minutes <dbl>, Arrival.Delay.in.Minutes <dbl>, …
Parallel Coordinate Plot
flight_clean %>%
  arrange(desc(Cluster)) %>%
  ggparcoord(
    columns = 1:23, groupColumn = "Cluster", order = "anyClass",
    showPoints = TRUE, 
    title = "Original",
    alphaLines = 1
    ) + 
  scale_color_manual(values=c( "#69b3a2", "#E8E8E8", "#E8E8E8") ) +
  theme(
    legend.position="Default",
    plot.title = element_text(size=10)
  ) +
  xlab("")+
  theme(axis.text.x = element_text(angle = 90))

myclustergraph <- ggparcoord(data = flight_clean, columns = c(1:23), groupColumn = "Cluster", scale = "std") + labs(x = "Flight Variables", y = "value (in standard-deviation units)",par(las=2), title = "Clustering")+
   theme(axis.text.x = element_text(angle = 90))
ggplotly(myclustergraph)
Proportion of Satisfaction in Each Cluster
ddataCat <- flight_clean %>%
mutate(cluster = final$cluster) 
#print()
DemoClusterJoin <- data.frame(flight_clean$satisfaction, ddataCat$cluster)
table00<-as.matrix(table(DemoClusterJoin))
table01<-100*prop.table(table00,2)
print(table01) #displaying that clearly the majority of satisfied customers fall in the second cluster.
##                          ddataCat.cluster
## flight_clean.satisfaction        1        2
##                         1 88.52978 24.18217
##                         2 11.47022 75.81783
Testing Cluster
set.seed(1)
final_test <- kmeans(flight_clean__test_standardized, 2, nstart = 25, iter.max = 10)
#print(final_test)
fviz_cluster(final_test, geom = "point",data = flight_clean__test_standardized)

K-Means Test Cluster Results
flight_clean__test_standardized %>%
  mutate(Cluster = final_test$cluster) %>%
  group_by(Cluster) %>%
  summarise_all("mean")
## # A tibble: 2 × 24
##   Cluster   Gender Customer.Type    Age Type.of.Travel  Class Flight.Distance
##     <int>    <dbl>         <dbl>  <dbl>          <dbl>  <dbl>           <dbl>
## 1       1 -0.00755         0.163  0.168         -0.396 -0.499           0.311
## 2       2  0.00646        -0.140 -0.144          0.339  0.427          -0.267
## # … with 17 more variables: Inflight.wifi.service <dbl>,
## #   Departure.Arrival.time.convenient <dbl>, Ease.of.Online.booking <dbl>,
## #   Gate.location <dbl>, Food.and.drink <dbl>, Online.boarding <dbl>,
## #   Seat.comfort <dbl>, Inflight.entertainment <dbl>, On.board.service <dbl>,
## #   Leg.room.service <dbl>, Baggage.handling <dbl>, Checkin.service <dbl>,
## #   Inflight.service <dbl>, Cleanliness <dbl>,
## #   Departure.Delay.in.Minutes <dbl>, Arrival.Delay.in.Minutes <dbl>, …
flight_clean_test$Cluster<-as.factor(final_test$cluster) #adding as a factor variable to the original data set so you can use it for supervised learning.
flight_clean %>%
  group_by(Cluster) %>%
  summarise_all("mean")
## # A tibble: 2 × 24
##   Cluster Gender Customer.Type   Age Type.of.Travel Class Flight.Distance
##   <fct>    <dbl>         <dbl> <dbl>          <dbl> <dbl>           <dbl>
## 1 1         1.49          1.78  37.5           1.48  1.85            920.
## 2 2         1.51          1.86  41.1           1.17  1.35           1446.
## # … with 17 more variables: Inflight.wifi.service <dbl>,
## #   Departure.Arrival.time.convenient <dbl>, Ease.of.Online.booking <dbl>,
## #   Gate.location <dbl>, Food.and.drink <dbl>, Online.boarding <dbl>,
## #   Seat.comfort <dbl>, Inflight.entertainment <dbl>, On.board.service <dbl>,
## #   Leg.room.service <dbl>, Baggage.handling <dbl>, Checkin.service <dbl>,
## #   Inflight.service <dbl>, Cleanliness <dbl>,
## #   Departure.Delay.in.Minutes <dbl>, Arrival.Delay.in.Minutes <dbl>, …

PAM

Distance for PAM
gower_dist<-daisy(flight_clean_PAM_standardized,metric="gower")
fviz_dist(gower_dist, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

PAM optimal clusters
fviz_nbclust(flight_clean_standardized[,-24], FUN = pam, method = "silhouette")

fviz_nbclust(flight_clean_standardized[,-24], FUN = pam, method = "wss")

PAM Cluster Results
k<-2
pam_fit <- pam(gower_dist, diss = TRUE, k)

flight_clean_PAM$Cluster<-as.factor(pam_fit$cluster)


pam_results <- flight_clean_PAM %>%
  mutate(cluster = pam_fit$clustering) %>%
  group_by(cluster) %>%
  do(the_summary = summary(.))
pam_results$the_summary
## [[1]]
##     Gender              Customer.Type      Age               Type.of.Travel
##  Female:565   disloyal Customer:232   Min.   : 7.0   Business travel:646   
##  Male  :556   Loyal Customer   :889   1st Qu.:25.0   Personal Travel:475   
##                                       Median :38.0                         
##                                       Mean   :38.2                         
##                                       3rd Qu.:51.0                         
##                                       Max.   :85.0                         
##       Class     Flight.Distance  Inflight.wifi.service
##  Business:375   Min.   :  67.0   Min.   :0.000        
##  Eco     :617   1st Qu.: 384.0   1st Qu.:2.000        
##  Eco Plus:129   Median : 696.0   Median :2.000        
##                 Mean   : 969.6   Mean   :2.401        
##                 3rd Qu.:1235.0   3rd Qu.:3.000        
##                 Max.   :3998.0   Max.   :5.000        
##  Departure.Arrival.time.convenient Ease.of.Online.booking Gate.location  
##  Min.   :0.000                     Min.   :0.000          Min.   :1.000  
##  1st Qu.:2.000                     1st Qu.:2.000          1st Qu.:2.000  
##  Median :3.000                     Median :3.000          Median :3.000  
##  Mean   :3.056                     Mean   :2.664          Mean   :3.003  
##  3rd Qu.:4.000                     3rd Qu.:4.000          3rd Qu.:4.000  
##  Max.   :5.000                     Max.   :5.000          Max.   :5.000  
##  Food.and.drink  Online.boarding  Seat.comfort   Inflight.entertainment
##  Min.   :0.000   Min.   :0.000   Min.   :1.000   Min.   :1.000         
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.000         
##  Median :2.000   Median :3.000   Median :2.000   Median :2.000         
##  Mean   :2.235   Mean   :2.759   Mean   :2.354   Mean   :1.951         
##  3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:3.000   3rd Qu.:2.000         
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000         
##  On.board.service Leg.room.service Baggage.handling Checkin.service
##  Min.   :1.000    Min.   :0.000    Min.   :1.00     Min.   :1.000  
##  1st Qu.:2.000    1st Qu.:2.000    1st Qu.:2.00     1st Qu.:2.000  
##  Median :3.000    Median :3.000    Median :3.00     Median :3.000  
##  Mean   :2.722    Mean   :2.797    Mean   :3.12     Mean   :2.968  
##  3rd Qu.:4.000    3rd Qu.:4.000    3rd Qu.:4.00     3rd Qu.:4.000  
##  Max.   :5.000    Max.   :5.000    Max.   :5.00     Max.   :5.000  
##  Inflight.service  Cleanliness   Departure.Delay.in.Minutes
##  Min.   :1.000    Min.   :1.00   Min.   : 0.000            
##  1st Qu.:2.000    1st Qu.:1.00   1st Qu.: 0.000            
##  Median :3.000    Median :2.00   Median : 0.000            
##  Mean   :3.169    Mean   :2.16   Mean   : 3.277            
##  3rd Qu.:4.000    3rd Qu.:3.00   3rd Qu.: 3.000            
##  Max.   :5.000    Max.   :5.00   Max.   :39.000            
##  Arrival.Delay.in.Minutes                  satisfaction Cluster     cluster 
##  Min.   : 0.00            neutral or dissatisfied:900   1:1121   Min.   :1  
##  1st Qu.: 0.00            satisfied              :221   2:   0   1st Qu.:1  
##  Median : 0.00                                                   Median :1  
##  Mean   : 2.89                                                   Mean   :1  
##  3rd Qu.: 4.00                                                   3rd Qu.:1  
##  Max.   :20.00                                                   Max.   :1  
## 
## [[2]]
##     Gender               Customer.Type       Age                Type.of.Travel
##  Female:1020   disloyal Customer: 332   Min.   : 7.00   Business travel:1494  
##  Male  : 997   Loyal Customer   :1685   1st Qu.:29.00   Personal Travel: 523  
##                                         Median :41.00                         
##                                         Mean   :40.18                         
##                                         3rd Qu.:51.00                         
##                                         Max.   :85.00                         
##       Class      Flight.Distance Inflight.wifi.service
##  Business:1159   Min.   :  31    Min.   :0.000        
##  Eco     : 737   1st Qu.: 453    1st Qu.:2.000        
##  Eco Plus: 121   Median : 937    Median :3.000        
##                  Mean   :1312    Mean   :2.917        
##                  3rd Qu.:2062    3rd Qu.:4.000        
##                  Max.   :4963    Max.   :5.000        
##  Departure.Arrival.time.convenient Ease.of.Online.booking Gate.location  
##  Min.   :0.000                     Min.   :0.000          Min.   :1.000  
##  1st Qu.:2.000                     1st Qu.:2.000          1st Qu.:2.000  
##  Median :3.000                     Median :3.000          Median :3.000  
##  Mean   :3.046                     Mean   :2.813          Mean   :2.963  
##  3rd Qu.:4.000                     3rd Qu.:4.000          3rd Qu.:4.000  
##  Max.   :5.000                     Max.   :5.000          Max.   :5.000  
##  Food.and.drink  Online.boarding  Seat.comfort   Inflight.entertainment
##  Min.   :1.000   Min.   :0.000   Min.   :1.000   Min.   :1.000         
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:4.000   1st Qu.:4.000         
##  Median :4.000   Median :4.000   Median :4.000   Median :4.000         
##  Mean   :3.784   Mean   :3.537   Mean   :4.064   Mean   :4.206         
##  3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.000         
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000         
##  On.board.service Leg.room.service Baggage.handling Checkin.service
##  Min.   :1.000    Min.   :0.000    Min.   :1.000    Min.   :1.000  
##  1st Qu.:3.000    1st Qu.:3.000    1st Qu.:4.000    1st Qu.:3.000  
##  Median :4.000    Median :4.000    Median :4.000    Median :4.000  
##  Mean   :3.804    Mean   :3.674    Mean   :3.971    Mean   :3.517  
##  3rd Qu.:5.000    3rd Qu.:5.000    3rd Qu.:5.000    3rd Qu.:5.000  
##  Max.   :5.000    Max.   :5.000    Max.   :5.000    Max.   :5.000  
##  Inflight.service  Cleanliness    Departure.Delay.in.Minutes
##  Min.   :1.000    Min.   :1.000   Min.   : 0.000            
##  1st Qu.:4.000    1st Qu.:3.000   1st Qu.: 0.000            
##  Median :4.000    Median :4.000   Median : 0.000            
##  Mean   :3.986    Mean   :3.953   Mean   : 2.798            
##  3rd Qu.:5.000    3rd Qu.:5.000   3rd Qu.: 2.000            
##  Max.   :5.000    Max.   :5.000   Max.   :39.000            
##  Arrival.Delay.in.Minutes                  satisfaction  Cluster     cluster 
##  Min.   : 0.000           neutral or dissatisfied: 806   1:   0   Min.   :2  
##  1st Qu.: 0.000           satisfied              :1211   2:2017   1st Qu.:2  
##  Median : 0.000                                                   Median :2  
##  Mean   : 2.339                                                   Mean   :2  
##  3rd Qu.: 2.000                                                   3rd Qu.:2  
##  Max.   :20.000                                                   Max.   :2

Again, very similar results compared to the k-means cluster test and k-means clusters.

Display PAM Cluster
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)

tsne_data <- tsne_obj$Y %>%
  data.frame() %>%
  setNames(c("X", "Y")) %>%
  mutate(cluster = factor(pam_fit$clustering),
         name = flight_clean_PAM_standardized$satisfaction)

ggplot(aes(x = X, y = Y), data = tsne_data) +
  geom_point(aes(color = cluster))

ddataCat2 <- flight_clean_PAM %>%
mutate(cluster = pam_fit$cluster)
#print()
DemoClusterJoin1 <- data.frame(flight_clean_PAM$satisfaction, ddataCat2$cluster)
table03<-as.matrix(table(DemoClusterJoin1))
table04<-100*prop.table(table03,2)
print(table04)
##                              ddataCat2.cluster
## flight_clean_PAM.satisfaction        1        2
##       neutral or dissatisfied 80.28546 39.96034
##       satisfied               19.71454 60.03966

Hierarchial Clustering

set.seed(2)
smaller.index <-createDataPartition(flight_clean_standardized$satisfaction, p = 0.35, list = FALSE)
 #Reducing the partition of the already partitioned data due to processing issues from hierarchical clustering
flight_clean_h <- flight_cluster[smaller.index, ]
colSums(is.na(flight_clean_h))
##                                 X                                id 
##                                 0                                 0 
##                            Gender                     Customer.Type 
##                                 0                                 0 
##                               Age                    Type.of.Travel 
##                                 0                                 0 
##                             Class                   Flight.Distance 
##                                 0                                 0 
##             Inflight.wifi.service Departure.Arrival.time.convenient 
##                                 0                                 0 
##            Ease.of.Online.booking                     Gate.location 
##                                 0                                 0 
##                    Food.and.drink                   Online.boarding 
##                                 0                                 0 
##                      Seat.comfort            Inflight.entertainment 
##                                 0                                 0 
##                  On.board.service                  Leg.room.service 
##                                 0                                 0 
##                  Baggage.handling                   Checkin.service 
##                                 0                                 0 
##                  Inflight.service                       Cleanliness 
##                                 0                                 0 
##        Departure.Delay.in.Minutes          Arrival.Delay.in.Minutes 
##                                 0                                 0 
##                      satisfaction 
##                                 0
flight_clean_h <- na.omit(flight_clean_h)
Analyzing which method to use.
methods <- c( "average", "single", "complete", "ward")
names(methods) <- c( "average", "single", "complete", "ward")
# 
ac <- function(x) {
agnes(flight_clean_h, method = x)$ac
}
map_dbl(methods, ac)
##   average    single  complete      ward 
## 0.9887540 0.7107700 0.9942590 0.9994965

Ward gives best results.

Hanging the Tree
hier_cluster_for_flight <- agnes(flight_clean_h, method = "ward")
pltree(hier_cluster_for_flight, cex = 0.6, hang = -1, main = "Dendrogram of agnes")

Optimal Number of Clusters
fviz_nbclust(flight_clean_h, FUN = hcut, method = "wss")

fviz_nbclust(flight_clean_h, FUN = hcut, method = "silhouette")

Number of Observations in Each Cluster
hier_flight_cluster <- cutree(hier_cluster_for_flight, k = 2)
table(hier_flight_cluster)
## hier_flight_cluster
##   1   2 
## 658 440
Results of Hierachal Clustering
aggregate(flight_clean_h[,-c(1,1)],list(hier_flight_cluster),mean)
##   Group.1        id   Gender Customer.Type      Age Type.of.Travel    Class
## 1       1  39688.85 1.500000      1.826748 39.91337       1.297872 1.615502
## 2       2 104149.08 1.527273      1.809091 40.90000       1.306818 1.513636
##   Flight.Distance Inflight.wifi.service Departure.Arrival.time.convenient
## 1        1205.764              2.822188                          3.118541
## 2        1291.159              2.779545                          3.211364
##   Ease.of.Online.booking Gate.location Food.and.drink Online.boarding
## 1               2.761398      2.962006        3.24924        3.284195
## 2               2.893182      3.034091        3.20000        3.322727
##   Seat.comfort Inflight.entertainment On.board.service Leg.room.service
## 1     3.483283               3.430091         3.348024         3.354103
## 2     3.484091               3.331818         3.436364         3.481818
##   Baggage.handling Checkin.service Inflight.service Cleanliness
## 1         3.559271        3.229483         3.711246    3.303951
## 2         3.704545        3.481818         3.759091    3.327273
##   Departure.Delay.in.Minutes Arrival.Delay.in.Minutes satisfaction
## 1                   2.319149                 2.246201     1.462006
## 2                   3.843182                 2.695455     1.450000